READ THE DATA
library(dplyr)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(data.table)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
data.table 1.12.8 using 4 threads (see ?getDTthreads). Latest news: r-datatable.com
Attaching package: ‘data.table’
The following objects are masked from ‘package:dplyr’:
between, first, last
library(mltools)
# Load chicago_crime_clean.csv into a data frame, keeping text columns as character.
chicago_crime <- read.table(
  file = "chicago_crime_clean.csv", # Name of the text file.
  sep = ",",                        # Field separator.
  header = TRUE,                    # Column names are in the first row.
  na.strings = "NA",                # Token treated as a missing value.
  stringsAsFactors = FALSE
)

# Clean the free-text offence description. Every pattern is a literal string,
# so fixed = TRUE keeps "." and other metacharacters from acting as regex.
# Periods are deliberately left in place (gsub(".", "", x) as a regex would
# delete every character).
chicago_crime$description <- gsub(":=", "", chicago_crime$description, fixed = TRUE)
chicago_crime$description <- gsub(":", "", chicago_crime$description, fixed = TRUE)
chicago_crime$description <- gsub("MANU/POSS. W/", "", chicago_crime$description, fixed = TRUE)
chicago_crime$description <- gsub(",", "", chicago_crime$description, fixed = TRUE)

# Clean the location description: commas become spaces, then the ride-share
# example in parentheses is removed. The parentheses and dots are escaped so
# they match literally, and "[, ]*" tolerates whatever commas/spaces separate
# the words (after the comma replacement they are separated by spaces only).
chicago_crime$location_description <- gsub(",", " ", chicago_crime$location_description, fixed = TRUE)
chicago_crime$location_description <- gsub("\\(E\\.G\\.[, ]*UBER[, ]*LYFT\\)", "",
                                           chicago_crime$location_description)

# Derive calendar month and day-of-month from the date column.
chicago_crime <- chicago_crime %>%
  dplyr::mutate(month = lubridate::month(date),
                day = lubridate::day(date))

# NOTE(review): rows with NA are dropped BEFORE the column selection here, so
# an NA in a column that is about to be discarded also removes the row. The
# training-set chunk selects first and then calls na.omit().
chicago_crime <- na.omit(chicago_crime)
chicago_crime <- select(chicago_crime, -c(X, unique_key, x_coordinate, y_coordinate,
                                          location, domestic, fbi_code, date))

# District is a categorical identifier, not a quantity.
chicago_crime$district <- factor(chicago_crime$district)
unique(chicago_crime$primary_type)
[1] "ASSAULT" "OTHER OFFENSE" "NARCOTICS"
[4] "DECEPTIVE PRACTICE" "CRIMINAL TRESPASS" "WEAPONS VIOLATION"
[7] "CRIM SEXUAL ASSAULT" "BURGLARY" "MOTOR VEHICLE THEFT"
[10] "KIDNAPPING" "PUBLIC PEACE VIOLATION" "INTERFERENCE WITH PUBLIC OFFICER"
[13] "BATTERY" "GAMBLING" "ROBBERY"
[16] "OFFENSE INVOLVING CHILDREN" "SEX OFFENSE" "THEFT"
[19] "CONCEALED CARRY LICENSE VIOLATION" "CRIMINAL DAMAGE" "ARSON"
[22] "HOMICIDE" "LIQUOR LAW VIOLATION" "STALKING"
[25] "INTIMIDATION" "PROSTITUTION" "HUMAN TRAFFICKING"
[28] "OBSCENITY" "OTHER NARCOTIC VIOLATION" "PUBLIC INDECENCY"
[31] "NON-CRIMINAL"
# Preview the first six rows, then print per-column summaries.
head(chicago_crime, n = 6L)
summary(object = chicago_crime)
case_number block primary_type description location_description arrest district
Length:213687 Length:213687 Length:213687 Length:213687 Length:213687 Length:213687 11 : 14869
Class :character Class :character Class :character Class :character Class :character Class :character 18 : 14243
Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character 1 : 14227
6 : 12717
8 : 12577
12 : 11592
(Other):133462
ward latitude longitude month day
Min. : 1.00 Min. :36.62 Min. :-91.69 Min. : 1.000 Min. : 1.0
1st Qu.:10.00 1st Qu.:41.77 1st Qu.:-87.71 1st Qu.: 4.000 1st Qu.: 8.0
Median :24.00 Median :41.87 Median :-87.66 Median : 7.000 Median :15.0
Mean :23.83 Mean :41.85 Mean :-87.67 Mean : 6.575 Mean :15.5
3rd Qu.:36.00 3rd Qu.:41.91 3rd Qu.:-87.63 3rd Qu.: 9.000 3rd Qu.:23.0
Max. :50.00 Max. :42.02 Max. :-87.52 Max. :12.000 Max. :31.0
Read the training set
library(dplyr)
library(data.table)
library(mltools)
# Load chicago_crime_tr.csv (the training set) into a data frame, keeping text
# columns as character.
chicago_crime_tr <- read.table(
  file = "chicago_crime_tr.csv", # Name of the text file.
  sep = ",",                     # Field separator.
  header = TRUE,                 # Column names are in the first row.
  na.strings = "NA",             # Token treated as a missing value.
  stringsAsFactors = FALSE
)

# Clean the free-text offence description. Every pattern is a literal string,
# so fixed = TRUE keeps "." and other metacharacters from acting as regex.
# Periods are deliberately left in place (gsub(".", "", x) as a regex would
# delete every character).
chicago_crime_tr$description <- gsub(":=", "", chicago_crime_tr$description, fixed = TRUE)
chicago_crime_tr$description <- gsub(":", "", chicago_crime_tr$description, fixed = TRUE)
chicago_crime_tr$description <- gsub("MANU/POSS. W/", "", chicago_crime_tr$description, fixed = TRUE)
chicago_crime_tr$description <- gsub(",", "", chicago_crime_tr$description, fixed = TRUE)

# Clean the location description: commas become spaces, then the ride-share
# example in parentheses is removed. The parentheses and dots are escaped so
# they match literally, and "[, ]*" tolerates whatever commas/spaces separate
# the words (after the comma replacement they are separated by spaces only).
chicago_crime_tr$location_description <- gsub(",", " ", chicago_crime_tr$location_description, fixed = TRUE)
chicago_crime_tr$location_description <- gsub("\\(E\\.G\\.[, ]*UBER[, ]*LYFT\\)", "",
                                              chicago_crime_tr$location_description)

# Derive calendar month and day-of-month from the date column.
chicago_crime_tr <- chicago_crime_tr %>%
  dplyr::mutate(month = lubridate::month(date),
                day = lubridate::day(date))

# Drop identifier/redundant columns first, then remove rows that still have
# a missing value in any remaining column.
chicago_crime_tr <- select(chicago_crime_tr, -c(X, unique_key, x_coordinate, y_coordinate,
                                                location, domestic, fbi_code, date))
chicago_crime_tr <- na.omit(chicago_crime_tr)

# District is a categorical identifier, not a quantity.
chicago_crime_tr$district <- factor(chicago_crime_tr$district)
unique(chicago_crime_tr$primary_type)
[1] "INTERFERENCE WITH PUBLIC OFFICER" "OTHER OFFENSE" "DECEPTIVE PRACTICE"
[4] "SEX OFFENSE" "CRIM SEXUAL ASSAULT" "BATTERY"
[7] "CRIMINAL TRESPASS" "MOTOR VEHICLE THEFT" "THEFT"
[10] "ASSAULT" "NARCOTICS" "ROBBERY"
[13] "PUBLIC PEACE VIOLATION" "WEAPONS VIOLATION" "STALKING"
[16] "OFFENSE INVOLVING CHILDREN" "ARSON" "BURGLARY"
[19] "CRIMINAL DAMAGE" "HOMICIDE" "INTIMIDATION"
[22] "KIDNAPPING" "HUMAN TRAFFICKING" "PROSTITUTION"
[25] "OBSCENITY" "CONCEALED CARRY LICENSE VIOLATION" "CRIMINAL SEXUAL ASSAULT"
[28] "NON-CRIMINAL" "PUBLIC INDECENCY" "LIQUOR LAW VIOLATION"
[31] "GAMBLING" "OTHER NARCOTIC VIOLATION" "NON - CRIMINAL"
[34] "NON-CRIMINAL (SUBJECT SPECIFIED)"
# Preview the first six training rows, then print per-column summaries.
head(chicago_crime_tr, n = 6L)
summary(object = chicago_crime_tr)
case_number block primary_type description location_description arrest district
Length:666666 Length:666666 Length:666666 Length:666666 Length:666666 Length:666666 11 : 45269
Class :character Class :character Class :character Class :character Class :character Class :character 1 : 42067
Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character 18 : 41971
8 : 41215
6 : 37548
12 : 35981
(Other):422615
ward latitude longitude month day
Min. : 1.00 Min. :41.64 Min. :-87.93 Min. : 1.000 Min. : 1.00
1st Qu.:11.00 1st Qu.:41.77 1st Qu.:-87.71 1st Qu.: 4.000 1st Qu.: 8.00
Median :24.00 Median :41.87 Median :-87.66 Median : 7.000 Median :16.00
Mean :23.76 Mean :41.85 Mean :-87.67 Mean : 6.594 Mean :15.72
3rd Qu.:36.00 3rd Qu.:41.91 3rd Qu.:-87.63 3rd Qu.: 9.000 3rd Qu.:23.00
Max. :50.00 Max. :42.02 Max. :-87.52 Max. :12.000 Max. :31.00
# Collapse fine-grained primary types of the training set into broader groups.
# No target label is itself a source label, so the order of these assignments
# does not matter.
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type %in%
  c("CRIMINAL TRESPASS", "BURGLARY")] <- "ROBBERY"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "MOTOR VEHICLE THEFT"] <- "THEFT"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type %in%
  c("HOMICIDE", "KIDNAPPING", "BATTERY", "INTIMIDATION", "ARSON")] <- "VIOLENT CRIME"
# The training data carries both spellings of the sexual-assault category
# ("CRIM SEXUAL ASSAULT" and "CRIMINAL SEXUAL ASSAULT"); map both.
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type %in%
  c("PROSTITUTION", "CRIM SEXUAL ASSAULT", "CRIMINAL SEXUAL ASSAULT")] <- "SEX OFFENSE"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "OTHER NARCOTIC VIOLATION"] <- "NARCOTICS"
# Normalise the spelling variants of the non-criminal category so they do not
# become separate factor levels.
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type %in%
  c("NON - CRIMINAL", "NON-CRIMINAL (SUBJECT SPECIFIED)")] <- "NON-CRIMINAL"
chicago_crime_tr$primary_type <- factor(chicago_crime_tr$primary_type)

# Keep only the eight crime groups analysed in the rest of the notebook.
chicago_crime_subset_tr <- subset(
  chicago_crime_tr,
  primary_type %in% c("ASSAULT", "VIOLENT CRIME", "THEFT", "NARCOTICS",
                      "WEAPONS VIOLATION", "ROBBERY", "CRIMINAL DAMAGE",
                      "DECEPTIVE PRACTICE")
)
# Re-create the factor so the levels of the discarded groups disappear.
chicago_crime_subset_tr$primary_type <- factor(chicago_crime_subset_tr$primary_type)
chicago_crime_subset_tr <- na.omit(chicago_crime_subset_tr)
library(DataExplorer)
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
# Structure diagram and missing-value profile of the training subset.
DataExplorer::plot_str(chicago_crime_subset_tr)
DataExplorer::plot_missing(chicago_crime_subset_tr)

#plot_histogram(chicago_crime_subset)
#plot_density(chicago_crime_subset)
#plot_correlation(chicago_numeric, type = 'continuous')
# Treat month as categorical so it is included in the bar charts.
chicago_crime_subset_tr[["month"]] <- as.factor(chicago_crime_subset_tr[["month"]])
DataExplorer::plot_bar(chicago_crime_subset_tr)
4 columns ignored with more than 50 categories.
case_number: 610345 categories
block: 31746 categories
description: 210 categories
location_description: 162 categories

EXPLORATORY ANALYSIS
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.2.1 ✓ purrr 0.3.3
✓ tibble 2.1.3 ✓ stringr 1.4.0
✓ tidyr 1.0.2 ✓ forcats 0.4.0
✓ readr 1.3.1
── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x data.table::between() masks dplyr::between()
x dplyr::filter() masks stats::filter()
x data.table::first() masks dplyr::first()
x dplyr::lag() masks stats::lag()
x data.table::last() masks dplyr::last()
x tidyr::replace_na() masks mltools::replace_na()
x purrr::transpose() masks data.table::transpose()
# Incidents per primary type (labels rotated because the names are long).
ggplot(chicago_crime, aes(x = primary_type)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

dplyr::count(chicago_crime, primary_type)

# Incidents per police district.
ggplot(chicago_crime, aes(x = district)) +
  geom_bar() +
  theme(axis.text.x = element_text(hjust = 1))

dplyr::count(chicago_crime, district)

# Incidents split by whether an arrest was made.
ggplot(chicago_crime, aes(x = arrest)) +
  geom_bar() +
  theme(axis.text.x = element_text(hjust = 1))

dplyr::count(chicago_crime, arrest)
# Collapse fine-grained primary types into broader groups. No target label is
# itself a source label, so grouping the sources per target is equivalent to
# recoding them one at a time.
chicago_crime$primary_type[chicago_crime$primary_type %in%
  c("CRIMINAL TRESPASS", "BURGLARY")] <- "ROBBERY"
chicago_crime$primary_type[chicago_crime$primary_type %in%
  c("MOTOR VEHICLE THEFT")] <- "THEFT"
chicago_crime$primary_type[chicago_crime$primary_type %in%
  c("HOMICIDE", "KIDNAPPING", "BATTERY", "INTIMIDATION", "ARSON")] <- "VIOLENT CRIME"
chicago_crime$primary_type[chicago_crime$primary_type %in%
  c("PROSTITUTION", "CRIM SEXUAL ASSAULT")] <- "SEX OFFENSE"
chicago_crime$primary_type[chicago_crime$primary_type %in%
  c("OTHER NARCOTIC VIOLATION")] <- "NARCOTICS"
chicago_crime$primary_type <- factor(chicago_crime$primary_type)

# Incidents per (recoded) primary type.
ggplot(chicago_crime, aes(x = primary_type)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

dplyr::count(chicago_crime, primary_type)
# Keep only the eight crime groups analysed in the rest of the notebook and
# re-create the factor so the levels of the discarded groups disappear.
chicago_crime_subset <- subset(
  chicago_crime,
  primary_type %in% c("ASSAULT", "VIOLENT CRIME", "THEFT", "NARCOTICS",
                      "WEAPONS VIOLATION", "ROBBERY", "CRIMINAL DAMAGE",
                      "DECEPTIVE PRACTICE")
)
chicago_crime_subset$primary_type <- factor(chicago_crime_subset$primary_type)

# Incidents per crime group.
ggplot(chicago_crime_subset, aes(x = primary_type)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Point size encodes the number of incidents for each crime group / district pair.
ggplot(chicago_crime_subset, aes(x = primary_type, y = district)) +
  geom_count() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Arrest outcome against crime group.
ggplot(chicago_crime_subset, aes(x = arrest, y = primary_type)) +
  geom_count() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Arrest outcome against district.
ggplot(chicago_crime_subset, aes(x = arrest, y = district)) +
  geom_count() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))


EXPLORATORY ANALYSIS BY CRIME
# Return the rows of `crimes` whose primary_type equals `type`. which() drops
# NA comparisons, matching subset()'s treatment of missing values, and
# drop = FALSE guarantees a data frame is returned.
rows_of_type <- function(crimes, type) {
  crimes[which(crimes$primary_type == type), , drop = FALSE]
}

# One data frame per crime group.
assault            <- rows_of_type(chicago_crime_subset, "ASSAULT")
violent_crime      <- rows_of_type(chicago_crime_subset, "VIOLENT CRIME")
theft              <- rows_of_type(chicago_crime_subset, "THEFT")
narcotics          <- rows_of_type(chicago_crime_subset, "NARCOTICS")
weapons_violation  <- rows_of_type(chicago_crime_subset, "WEAPONS VIOLATION")
robbery            <- rows_of_type(chicago_crime_subset, "ROBBERY")
criminal_damage    <- rows_of_type(chicago_crime_subset, "CRIMINAL DAMAGE")
deceptive_practice <- rows_of_type(chicago_crime_subset, "DECEPTIVE PRACTICE")

# The same split for the training subset. Note that the violent-crime frame is
# named violent_tr_crime, unlike the other *_tr objects.
assault_tr            <- rows_of_type(chicago_crime_subset_tr, "ASSAULT")
violent_tr_crime      <- rows_of_type(chicago_crime_subset_tr, "VIOLENT CRIME")
theft_tr              <- rows_of_type(chicago_crime_subset_tr, "THEFT")
narcotics_tr          <- rows_of_type(chicago_crime_subset_tr, "NARCOTICS")
weapons_violation_tr  <- rows_of_type(chicago_crime_subset_tr, "WEAPONS VIOLATION")
robbery_tr            <- rows_of_type(chicago_crime_subset_tr, "ROBBERY")
criminal_damage_tr    <- rows_of_type(chicago_crime_subset_tr, "CRIMINAL DAMAGE")
deceptive_practice_tr <- rows_of_type(chicago_crime_subset_tr, "DECEPTIVE PRACTICE")
DISTRICTS
library(sqldf)
Loading required package: gsubfn
Loading required package: proto
unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
Referenced from: /Library/Frameworks/R.framework/Resources/modules//R_X11.so
Reason: image not foundCould not load tcltk. Will use slower R code instead.
Loading required package: RSQLite
# Per-district aggregates: mean incident coordinates and the number of
# incidents with an arrest (districts_true) or without one (districts_false).
# The string values are written as single-quoted SQL literals; double-quoted
# tokens are identifiers in SQL and only work as strings through SQLite's
# legacy fallback.
districts_true <- sqldf("SELECT district, AVG(latitude) as avg_latitude,AVG(longitude) as avg_longitude, count(*) as arrest FROM chicago_crime_subset WHERE arrest LIKE 'True' GROUP BY district ORDER BY district")
districts_false <- sqldf("SELECT district, AVG(latitude) as avg_latitude,AVG(longitude) as avg_longitude, count(*) as no_arrest FROM chicago_crime_subset WHERE arrest LIKE 'False' GROUP BY district ORDER BY district")

# Make sure the counts are numeric before they are used in arithmetic.
# NOTE(review): presumably needed because sqldf may hand the `arrest` result
# back with the class of the same-named input column - confirm.
districts_true$arrest <- as.numeric(districts_true$arrest)
districts_false$no_arrest <- as.numeric(districts_false$no_arrest)
districts_true
districts_false
# Load the police station table from Police_Stations.csv.
police_districts <- read.table(
  file = "Police_Stations.csv", # Name of the text file.
  sep = ",",                    # Field separator.
  header = TRUE,                # Column names are in the first row.
  na.strings = "NA",            # Token treated as a missing value.
  stringsAsFactors = FALSE
)
police_districts

# Give the "Headquarters" row the district id "0", then treat DISTRICT as
# categorical.
police_districts$DISTRICT[police_districts$DISTRICT == "Headquarters"] <- "0"
police_districts$DISTRICT <- as.factor(police_districts$DISTRICT)

# Station coordinates with lower-case column names.
districts <- sqldf('SELECT DISTRICT as district, LATITUDE as latitude,LONGITUDE as longitude FROM police_districts')

# Align the arrest counts with the no-arrest table BY DISTRICT instead of by
# row position: if a district had no arrests it would be missing from
# districts_true and every following row would be paired with the wrong
# district. Districts without arrests get a count of 0.
arrests_by_district <- districts_true$arrest[match(districts_false$district, districts_true$district)]
arrests_by_district[is.na(arrests_by_district)] <- 0
if (!all(districts_true$district %in% districts_false$district)) {
  warning("Some districts have arrests but no non-arrest records; ",
          "they are omitted from arrest_percentage.", call. = FALSE)
}

# Share of incidents with an arrest and total incident count per district.
arrest_percentage <- data.frame(
  'District' = districts_false$district,
  'PctArrest' = arrests_by_district / (arrests_by_district + districts_false$no_arrest),
  'Crimes' = arrests_by_district + districts_false$no_arrest
)
arrest_percentage

# Bars show incident counts; the line shows the arrest share, multiplied by
# 10000 to share the primary axis and divided back on the secondary axis.
ggplot(data = arrest_percentage) +
  geom_col(mapping = aes(x = District, y = Crimes)) +
  geom_line(aes(x = District, y = PctArrest*10000, group = 1), color = "yellow") +
  scale_y_continuous(sec.axis = sec_axis(~./10000, name = "PctArrest")) +
  theme(axis.text.x = element_text(hjust = 1))

## INITIALIZE
library("leaflet")
library("data.table")
library("sp")
library("rgdal")
rgdal: version: 1.4-8, (SVN revision 845)
Geospatial Data Abstraction Library extensions to R successfully loaded
Loaded GDAL runtime: GDAL 2.4.2, released 2019/06/28
Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/gdal
GDAL binary built with GEOS: FALSE
Loaded PROJ.4 runtime: Rel. 5.2.0, September 15th, 2018, [PJ_VERSION: 520]
Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/proj
Linking to sp version: 1.3-2
# library("maptools")
library("KernSmooth")
KernSmooth 2.23 loaded
Copyright M. P. Wand 1997-2009
# Convert districts_false to a data.table by reference (no copy is made).
data.table::setDT(districts_false)
#devtools::install_github("dkahle/ggmap", ref = "tidyup", force = TRUE)
library(ggmap)
Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
Please cite ggmap if you use it! See citation("ggmap") for details.
# Download the Stamen basemap tiles for the bounding box below at zoom 11.
chicago <- get_stamenmap(
  bbox = c(left = -88.0225, bottom = 41.5949, right = -87.2713, top = 42.0677),
  zoom = 11
)
Source : http://tile.stamen.com/terrain/11/523/759.png
Source : http://tile.stamen.com/terrain/11/524/759.png
Source : http://tile.stamen.com/terrain/11/525/759.png
Source : http://tile.stamen.com/terrain/11/526/759.png
Source : http://tile.stamen.com/terrain/11/527/759.png
Source : http://tile.stamen.com/terrain/11/523/760.png
Source : http://tile.stamen.com/terrain/11/524/760.png
Source : http://tile.stamen.com/terrain/11/525/760.png
Source : http://tile.stamen.com/terrain/11/526/760.png
Source : http://tile.stamen.com/terrain/11/527/760.png
Source : http://tile.stamen.com/terrain/11/523/761.png
Source : http://tile.stamen.com/terrain/11/524/761.png
Source : http://tile.stamen.com/terrain/11/525/761.png
Source : http://tile.stamen.com/terrain/11/526/761.png
Source : http://tile.stamen.com/terrain/11/527/761.png
Source : http://tile.stamen.com/terrain/11/523/762.png
Source : http://tile.stamen.com/terrain/11/524/762.png
Source : http://tile.stamen.com/terrain/11/525/762.png
Source : http://tile.stamen.com/terrain/11/526/762.png
Source : http://tile.stamen.com/terrain/11/527/762.png
Source : http://tile.stamen.com/terrain/11/523/763.png
Source : http://tile.stamen.com/terrain/11/524/763.png
Source : http://tile.stamen.com/terrain/11/525/763.png
Source : http://tile.stamen.com/terrain/11/526/763.png
Source : http://tile.stamen.com/terrain/11/527/763.png
# Label each station location on the basemap with its district id.
ggmap(chicago) +
  geom_text(
    data = districts,
    mapping = aes(x = longitude, y = latitude, label = district)
  )

library(ggmap)
# Download the same basemap again and label the stations straight from the
# police_districts table (upper-case column names).
chicago <- get_stamenmap(
  bbox = c(left = -88.0225, bottom = 41.5949, right = -87.2713, top = 42.0677),
  zoom = 11
)
ggmap(chicago) +
  geom_text(
    data = police_districts,
    mapping = aes(x = LONGITUDE, y = LATITUDE, label = DISTRICT)
  )

# Bar chart of incident counts per police district for one crime group.
plot_crimes_by_district <- function(crimes, title) {
  ggplot(data = crimes) +
    geom_bar(mapping = aes(x = district)) +
    theme(axis.text.x = element_text(hjust = 1)) +
    ggtitle(title)
}

plot_crimes_by_district(assault, "ASSAULT BY DISTRICT")

plot_crimes_by_district(theft, "THEFTS BY DISTRICT")

plot_crimes_by_district(violent_crime, "VIOLENT CRIMES BY DISTRICT")

plot_crimes_by_district(narcotics, "NARCOTIC CRIMES BY DISTRICT")

plot_crimes_by_district(weapons_violation, "WEAPON-RELATED CRIMES BY DISTRICT")

plot_crimes_by_district(robbery, "ROBBERIES BY DISTRICT")

plot_crimes_by_district(criminal_damage, "CRIMINAL DAMAGE CRIMES BY DISTRICT")

plot_crimes_by_district(deceptive_practice, "DECEPTIVE PRACTICE CRIMES BY DISTRICT")

library(ggplot2)
# Tile per crime group / district pair, filled by arrest outcome.
ggplot(
  data = chicago_crime_subset,
  mapping = aes(x = primary_type, y = district, fill = arrest)
) +
  geom_tile() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Correlation
library(ggplot2)
# Semi-transparent points per district / crime group pair, coloured by arrest outcome.
ggplot(
  data = chicago_crime_subset,
  mapping = aes(x = district, y = primary_type, color = arrest)
) +
  geom_point(alpha = 0.5)

Association Rules
# Drop every column that should not become an item in the association-rule
# mining step, then export the remaining columns as an unquoted CSV without
# row names.
chicago_crime_subset_2 <- subset(
  chicago_crime_subset,
  select = -c(case_number, block, ward, description, day, month,
              latitude, longitude, location_description)
)
write.csv(
  chicago_crime_subset_2,
  file = "chicago_crime_AR.csv",
  quote = FALSE,
  row.names = FALSE
)
library(arules)
Loading required package: Matrix
Attaching package: ‘Matrix’
The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
Attaching package: ‘arules’
The following object is masked from ‘package:dplyr’:
recode
The following objects are masked from ‘package:base’:
abbreviate, write
# Read the exported CSV in basket format: every line is one transaction and
# every comma-separated value is one item. write.csv() always emits a header
# line, so skip it - otherwise the column names are read as one extra
# transaction and appear as spurious items.
crime_transactions <- read.transactions("chicago_crime_AR.csv", sep = ",", skip = 1)
#deceptive_practice_2 <- subset(deceptive_practice, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
#write.csv(deceptive_practice_2,"deceptive_practice.csv", quote = FALSE, row.names = FALSE)
#dp_transactions <- read.transactions("deceptive_practice.csv", sep=",")
# Install RColorBrewer only when it is missing, then attach it. library()
# stops with an error if the package still cannot be loaded, instead of
# returning FALSE the way require() does.
if (!requireNamespace("RColorBrewer", quietly = TRUE)) {
  install.packages("RColorBrewer")
}
library(RColorBrewer)
Loading required package: RColorBrewer
# Absolute counts of the 20 most frequent items.
itemFrequencyPlot(
  crime_transactions,
  topN = 20,
  type = "absolute",
  col = brewer.pal(8, "Pastel2"),
  main = "Absolute Item Frequency Plot"
)

General Association Rules
# Rule generation over all items: minimum support 0.001, minimum confidence 0.7.
association.rules.clean <- apriori(
  data = crime_transactions,
  parameter = list(support = 0.001, confidence = 0.7)
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 194
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.04s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.04s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [179 rule(s)] done [0.00s].
creating S4 object ... done [0.02s].
# Flag redundant rules: column j of the subset matrix counts how many rules
# have an item set contained in rule j's (every rule counts itself), so a sum
# above 1 marks a rule that contains another rule.
subset.rules.clean <- which(colSums(is.subset(association.rules.clean, association.rules.clean)) > 1)
# Guard the empty case: indexing with -integer(0) selects nothing, which would
# silently discard every rule when none is redundant.
subset.association.rules.clean. <- if (length(subset.rules.clean) > 0) {
  association.rules.clean[-subset.rules.clean]
} else {
  association.rules.clean
}
inspect(subset.association.rules.clean.)

# Rankings of the full (unpruned) rule set.
rules_by_count <- sort(association.rules.clean, by = "count")
rules_by_conf <- sort(association.rules.clean, by = "confidence")
# NOTE(review): despite its name, rules_by_supp is ordered by lift, not by support.
rules_by_supp <- sort(association.rules.clean, by = "lift")
inspect(rules_by_count)
inspect(rules_by_conf)
inspect(rules_by_supp)
# Rule generation with ASSAULT as the only allowed consequent; all other items
# may appear on the left-hand side only. Minimum support 0.001, confidence 0.1.
assault.association.rules <- apriori(
  data = crime_transactions,
  parameter = list(support = 0.001, confidence = 0.1),
  appearance = list(rhs = "ASSAULT", default = "lhs")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 194
set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.04s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.05s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [7 rule(s)] done [0.00s].
creating S4 object ... done [0.02s].
# Remove redundant rules: a rule is flagged when its item set contains that of
# at least one other rule (each rule is a subset of itself, hence "> 1").
assault.subset.rules <- which(colSums(is.subset(assault.association.rules, assault.association.rules)) > 1)
# Guard the empty case: indexing with -integer(0) selects nothing, which would
# silently discard every rule when none is redundant.
assault.subset.association.rules. <- if (length(assault.subset.rules) > 0) {
  assault.association.rules[-assault.subset.rules]
} else {
  assault.association.rules
}
inspect(assault.subset.association.rules.)

# Rankings of the full (unpruned) rule set.
as_by_count <- sort(assault.association.rules, by = "count")
as_by_conf <- sort(assault.association.rules, by = "confidence")
inspect(as_by_count)
inspect(as_by_conf)
# Rule generation with CRIMINAL DAMAGE as the only allowed consequent; all
# other items may appear on the left-hand side only. Minimum support 0.001,
# confidence 0.1.
cd.association.rules <- apriori(
  data = crime_transactions,
  parameter = list(support = 0.001, confidence = 0.1),
  appearance = list(rhs = "CRIMINAL DAMAGE", default = "lhs")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 194
set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.04s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.04s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [40 rule(s)] done [0.00s].
creating S4 object ... done [0.02s].
# Remove redundant rules: a rule is flagged when its item set contains that of
# at least one other rule (each rule is a subset of itself, hence "> 1").
cd.subset.rules <- which(colSums(is.subset(cd.association.rules, cd.association.rules)) > 1)
# Guard the empty case: indexing with -integer(0) selects nothing, which would
# silently discard every rule when none is redundant.
cd.subset.association.rules. <- if (length(cd.subset.rules) > 0) {
  cd.association.rules[-cd.subset.rules]
} else {
  cd.association.rules
}
# Inspect the pruned set, as the other crime groups do; the full set is still
# shown through the sorted views below.
inspect(cd.subset.association.rules.)

# Rankings of the full (unpruned) rule set.
cd_by_count <- sort(cd.association.rules, by = "count")
cd_by_conf <- sort(cd.association.rules, by = "confidence")
inspect(cd_by_count)
inspect(cd_by_conf)
# Rule generation with DECEPTIVE PRACTICE as the only allowed consequent; all
# other items may appear on the left-hand side only. Minimum support 0.001,
# confidence 0.1.
dp.association.rules <- apriori(
  data = crime_transactions,
  parameter = list(support = 0.001, confidence = 0.1),
  appearance = list(rhs = "DECEPTIVE PRACTICE", default = "lhs")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 194
set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.04s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.04s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [19 rule(s)] done [0.00s].
creating S4 object ... done [0.05s].
# Remove redundant rules: a rule is flagged when its item set contains that of
# at least one other rule (each rule is a subset of itself, hence "> 1").
dp.subset.rules <- which(colSums(is.subset(dp.association.rules, dp.association.rules)) > 1)
# Guard the empty case: indexing with -integer(0) selects nothing, which would
# silently discard every rule when none is redundant.
dp.subset.association.rules. <- if (length(dp.subset.rules) > 0) {
  dp.association.rules[-dp.subset.rules]
} else {
  dp.association.rules
}
inspect(dp.subset.association.rules.)

# Rankings of the pruned rule set.
dp_by_count <- sort(dp.subset.association.rules., by = "count")
dp_by_conf <- sort(dp.subset.association.rules., by = "confidence")
inspect(dp_by_count)
inspect(dp_by_conf)
# Rule generation with NARCOTICS as the only allowed consequent; all other
# items may appear on the left-hand side only. Minimum support 0.001,
# confidence 0.1.
narcotics_clean.association.rules <- apriori(
  data = crime_transactions,
  parameter = list(support = 0.001, confidence = 0.1),
  appearance = list(rhs = "NARCOTICS", default = "lhs")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 194
set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.04s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.04s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [18 rule(s)] done [0.00s].
creating S4 object ... done [0.02s].
# Remove redundant rules: a rule is flagged when its item set contains that of
# at least one other rule (each rule is a subset of itself, hence "> 1").
narcotics_clean.subset.rules <- which(colSums(is.subset(narcotics_clean.association.rules, narcotics_clean.association.rules)) > 1)
# Guard the empty case: indexing with -integer(0) selects nothing, which would
# silently discard every rule when none is redundant.
narcotics_clean.subset.association.rules. <- if (length(narcotics_clean.subset.rules) > 0) {
  narcotics_clean.association.rules[-narcotics_clean.subset.rules]
} else {
  narcotics_clean.association.rules
}
inspect(narcotics_clean.subset.association.rules.)

# Rankings of the full (unpruned) rule set.
narc_by_count <- sort(narcotics_clean.association.rules, by = "count")
narc_by_conf <- sort(narcotics_clean.association.rules, by = "confidence")
inspect(narc_by_count)
inspect(narc_by_conf)
# Rule generation with ROBBERY as the only allowed consequent; all other items
# may appear on the left-hand side only. Minimum support 0.001, confidence 0.15.
robbery.association.rules <- apriori(
  data = crime_transactions,
  parameter = list(support = 0.001, confidence = 0.15),
  appearance = list(rhs = "ROBBERY", default = "lhs")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 194
set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.03s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.05s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [11 rule(s)] done [0.00s].
creating S4 object ... done [0.02s].
# Remove redundant rules: a rule is flagged when its item set contains that of
# at least one other rule (each rule is a subset of itself, hence "> 1").
robbery.subset.rules <- which(colSums(is.subset(robbery.association.rules, robbery.association.rules)) > 1)
# Guard the empty case: indexing with -integer(0) selects nothing, which would
# silently discard every rule when none is redundant.
robbery.subset.association.rules. <- if (length(robbery.subset.rules) > 0) {
  robbery.association.rules[-robbery.subset.rules]
} else {
  robbery.association.rules
}
# Inspect the pruned set, as the other crime groups do; the full set is still
# shown through the sorted views below.
inspect(robbery.subset.association.rules.)

# Rankings of the full (unpruned) rule set.
rob_by_count <- sort(robbery.association.rules, by = "count")
rob_by_conf <- sort(robbery.association.rules, by = "confidence")
inspect(rob_by_count)
inspect(rob_by_conf)
# Rule generation with THEFT as the only allowed consequent; all other items
# may appear on the left-hand side only. Minimum support 0.005, confidence 0.5.
theft.association.rules <- apriori(
  data = crime_transactions,
  parameter = list(support = 0.005, confidence = 0.5),
  appearance = list(rhs = "THEFT", default = "lhs")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 972
set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.04s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.05s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [4 rule(s)] done [0.00s].
creating S4 object ... done [0.02s].
# Remove redundant rules: a rule is flagged when its item set contains that of
# at least one other rule (each rule is a subset of itself, hence "> 1").
theft.subset.rules <- which(colSums(is.subset(theft.association.rules, theft.association.rules)) > 1)
# Guard the empty case: indexing with -integer(0) selects nothing, which would
# silently discard every rule when none is redundant.
theft.subset.association.rules. <- if (length(theft.subset.rules) > 0) {
  theft.association.rules[-theft.subset.rules]
} else {
  theft.association.rules
}
inspect(theft.subset.association.rules.)

# Rankings of the full (unpruned) rule set.
theft_by_count <- sort(theft.association.rules, by = "count")
theft_by_conf <- sort(theft.association.rules, by = "confidence")
inspect(theft_by_count)
inspect(theft_by_conf)
# Rule generation with VIOLENT CRIME as the only allowed consequent; all other
# items may appear on the left-hand side only. Minimum support 0.001,
# confidence 0.15.
vc.association.rules <- apriori(
  data = crime_transactions,
  parameter = list(support = 0.001, confidence = 0.15),
  appearance = list(rhs = "VIOLENT CRIME", default = "lhs")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 194
set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.03s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.05s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [21 rule(s)] done [0.00s].
creating S4 object ... done [0.02s].
# Remove redundant rules: a rule is flagged when its item set contains that of
# at least one other rule (each rule is a subset of itself, hence "> 1").
vc.subset.rules <- which(colSums(is.subset(vc.association.rules, vc.association.rules)) > 1)
# Guard the empty case: indexing with -integer(0) selects nothing, which would
# silently discard every rule when none is redundant.
vc.subset.association.rules. <- if (length(vc.subset.rules) > 0) {
  vc.association.rules[-vc.subset.rules]
} else {
  vc.association.rules
}
inspect(vc.subset.association.rules.)

# Rankings of the full (unpruned) rule set.
vc_by_count <- sort(vc.association.rules, by = "count")
vc_by_conf <- sort(vc.association.rules, by = "confidence")
inspect(vc_by_count)
inspect(vc_by_conf)
# Rule generation with WEAPONS VIOLATION as the only allowed consequent; all
# other items may appear on the left-hand side only. Minimum support 0.001,
# confidence 0.1.
wv.association.rules <- apriori(
  data = crime_transactions,
  parameter = list(support = 0.001, confidence = 0.1),
  appearance = list(rhs = "WEAPONS VIOLATION", default = "lhs")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 194
set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.03s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.05s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [8 rule(s)] done [0.00s].
creating S4 object ... done [0.02s].
# Remove redundant rules: a rule is flagged when its item set contains that of
# at least one other rule (each rule is a subset of itself, hence "> 1").
wv.subset.rules <- which(colSums(is.subset(wv.association.rules, wv.association.rules)) > 1)
# Guard the empty case: indexing with -integer(0) selects nothing, which would
# silently discard every rule when none is redundant.
wv.subset.association.rules. <- if (length(wv.subset.rules) > 0) {
  wv.association.rules[-wv.subset.rules]
} else {
  wv.association.rules
}
inspect(wv.subset.association.rules.)

# Rankings of the full (unpruned) rule set.
wv_by_count <- sort(wv.association.rules, by = "count")
wv_by_conf <- sort(wv.association.rules, by = "confidence")
inspect(wv_by_count)
inspect(wv_by_conf)
# Mine rules whose right-hand side is the item "True"
# (minimum support 0.001, minimum confidence 0.5).
true.association.rules <- apriori(
  crime_transactions,
  parameter = list(supp = 0.001, conf = 0.5),
  appearance = list(default = "lhs", rhs = "True")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 194
set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.04s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.05s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [24 rule(s)] done [0.00s].
creating S4 object ... done [0.02s].
# Remove redundant rules ----
# Flag every rule that has more than one rule (itself included) as a subset.
true.subset.rules <- which(colSums(is.subset(true.association.rules, true.association.rules)) > 1)
# Drop the flagged rules with a logical mask. A negative index built from an
# empty `which()` result (`rules[-integer(0)]`) would select nothing and
# silently discard every rule.
true.keep.rules <- !(seq_len(length(true.association.rules)) %in% true.subset.rules)
true.subset.association.rules. <- true.association.rules[true.keep.rules]
inspect(true.subset.association.rules.)
# Rank the pruned rule set by count and by confidence.
t_by_count <- sort(true.subset.association.rules., by = "count")
t_by_conf <- sort(true.subset.association.rules., by = "confidence")
#wv_by_supp <- sort(wv.subset.association.rules., by = "support")
inspect(t_by_count)
inspect(t_by_conf)
#inspect(wv_by_supp)
# Mine rules whose right-hand side is the item "False"
# (minimum support 0.001, minimum confidence 0.8).
false.association.rules <- apriori(
  crime_transactions,
  parameter = list(supp = 0.001, conf = 0.8),
  appearance = list(default = "lhs", rhs = "False")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 194
set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.04s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.05s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [125 rule(s)] done [0.00s].
creating S4 object ... done [0.02s].
# Remove redundant rules ----
# Flag every rule that has more than one rule (itself included) as a subset.
false.subset.rules <- which(colSums(is.subset(false.association.rules, false.association.rules)) > 1)
# Drop the flagged rules with a logical mask. A negative index built from an
# empty `which()` result (`rules[-integer(0)]`) would select nothing and
# silently discard every rule.
false.keep.rules <- !(seq_len(length(false.association.rules)) %in% false.subset.rules)
false.subset.association.rules. <- false.association.rules[false.keep.rules]
inspect(false.subset.association.rules.)
# NOTE(review): the rankings below use the full (unpruned) rule set.
f_by_count <- sort(false.association.rules, by = "count")
f_by_conf <- sort(false.association.rules, by = "confidence")
#wv_by_supp <- sort(wv.subset.association.rules., by = "support")
inspect(f_by_count)
inspect(f_by_conf)
#inspect(wv_by_supp)
# Mine rules whose right-hand side is the item "8"
# (minimum support 0.0001, minimum confidence 0.01).
ocho.association.rules <- apriori(
  crime_transactions,
  parameter = list(supp = 0.0001, conf = 0.01),
  appearance = list(default = "lhs", rhs = "8")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 19
set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.03s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.04s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [26 rule(s)] done [0.00s].
creating S4 object ... done [0.02s].
# Remove redundant rules ----
# Flag every rule that has more than one rule (itself included) as a subset.
ocho.subset.rules <- which(colSums(is.subset(ocho.association.rules, ocho.association.rules)) > 1)
# Drop the flagged rules with a logical mask. A negative index built from an
# empty `which()` result (`rules[-integer(0)]`) would select nothing and
# silently discard every rule.
ocho.keep.rules <- !(seq_len(length(ocho.association.rules)) %in% ocho.subset.rules)
ocho.subset.association.rules. <- ocho.association.rules[ocho.keep.rules]
inspect(ocho.subset.association.rules.)
# NOTE(review): the rankings below use the full (unpruned) rule set.
ocho_by_count <- sort(ocho.association.rules, by = "count")
ocho_by_conf <- sort(ocho.association.rules, by = "confidence")
#wv_by_supp <- sort(wv.subset.association.rules., by = "support")
inspect(ocho_by_count)
inspect(ocho_by_conf)
#inspect(wv_by_supp)
## GRAFICOS
## Dataset Entero
library(arulesViz)
Loading required package: grid
Registered S3 method overwritten by 'seriation':
method from
reorder.hclust gclus
# Keep only the rules whose confidence exceeds 0.7 (70%)
subRules <- association.rules.clean[
  quality(association.rules.clean)[["confidence"]] > 0.7
]
# Two-key plot of the filtered rules
plot(subRules, method = "two-key plot")

## Limit the clean-dataset rules to the 25 with the highest confidence
top10subRules <- head(subRules, n = 25, by = "confidence")
# Interactive graph view; engine = "htmlwidget" makes the plot interactive
plot(top10subRules, method = "graph", engine = "htmlwidget")
## Individual rule representation, clean dataset
# Same selection again: the 25 rules with the highest confidence
subRules2 <- head(subRules, n = 25, by = "confidence")
plot(subRules2, method = "paracoord")

# Two-key plot of the pruned assault rules
plot(assault.subset.association.rules., method = "two-key plot")

## Select the 20 assault rules with the highest confidence
top10subRules <- head(
  assault.subset.association.rules.,
  n = 20,
  by = "confidence"
)
inspect(top10subRules)
# Interactive graph of the whole pruned assault rule set
plot(
  assault.subset.association.rules.,
  method = "graph",
  engine = "htmlwidget"
)
## Individual rule representation
# subRules2 repeats the top-20-by-confidence selection; the parallel
# coordinates plot below is drawn from top10subRules.
subRules2 <- head(
  assault.subset.association.rules.,
  n = 20,
  by = "confidence"
)
plot(top10subRules, method = "paracoord")

# Two-key plot of all criminal-damage rules
plot(cd.association.rules, method = "two-key plot")

# Rules with confidence above 0.2
subRules_cd <- cd.association.rules[
  quality(cd.association.rules)[["confidence"]] > 0.2
]
## Select the 10 rules with the highest confidence
top10subRules <- head(cd.association.rules, n = 10, by = "confidence")
inspect(top10subRules)
# Interactive graph of the confidence-filtered rules
plot(subRules_cd, method = "graph", engine = "htmlwidget")
## Individual rule representation
# subRules2 holds the top 25 filtered rules by confidence; the parallel
# coordinates plot below is drawn from top10subRules.
subRules2 <- head(subRules_cd, n = 25, by = "confidence")
plot(top10subRules, method = "paracoord")

# Two-key plot of all deceptive-practice rules
plot(dp.association.rules, method = "two-key plot")

# Rules with confidence above 0.1
subRules_dp <- dp.association.rules[
  quality(dp.association.rules)[["confidence"]] > 0.1
]
## Select the 10 rules with the highest count
top10subRules <- head(dp.association.rules, n = 10, by = "count")
# Interactive graph of the confidence-filtered rules
plot(subRules_dp, method = "graph", engine = "htmlwidget")
## Individual rule representation
# subRules2 holds the top 25 filtered rules by count; the parallel
# coordinates plot below is drawn from the full filtered set.
subRules2 <- head(subRules_dp, n = 25, by = "count")
plot(subRules_dp, method = "paracoord")

# Two-key plot of all narcotics rules
plot(narcotics_clean.association.rules, method = "two-key plot")

# Rules with confidence above 0.6
subRules_narcotics <- narcotics_clean.association.rules[
  quality(narcotics_clean.association.rules)[["confidence"]] > 0.6
]
## Select the 10 rules with the highest confidence
top10subRules <- head(
  narcotics_clean.association.rules,
  n = 10,
  by = "confidence"
)
# Interactive graph of the confidence-filtered rules
plot(subRules_narcotics, method = "graph", engine = "htmlwidget")
## Individual rule representation
# subRules2 holds the top 25 filtered rules by confidence; the parallel
# coordinates plot below is drawn from the full filtered set.
subRules2 <- head(subRules_narcotics, n = 25, by = "confidence")
plot(subRules_narcotics, method = "paracoord")

# Two-key plot of all robbery rules
plot(robbery.association.rules, method = "two-key plot")

# Rules with confidence above 0.15
subRules_robbery <- robbery.association.rules[
  quality(robbery.association.rules)[["confidence"]] > 0.15
]
## Select the 10 rules with the highest confidence
top10subRules <- head(robbery.association.rules, n = 10, by = "confidence")
inspect(top10subRules)
# Interactive graph of the confidence-filtered rules
plot(subRules_robbery, method = "graph", engine = "htmlwidget")
## Individual rule representation
# subRules2 holds the top 25 filtered rules by confidence; the parallel
# coordinates plot below is drawn from top10subRules.
subRules2 <- head(subRules_robbery, n = 25, by = "confidence")
plot(top10subRules, method = "paracoord")

# Two-key plot of all theft rules
plot(theft.association.rules, method = "two-key plot")

# Rules with confidence above 0.45
subRules_theft <- theft.association.rules[
  quality(theft.association.rules)[["confidence"]] > 0.45
]
## Select the 10 rules with the highest confidence
top10subRules <- head(theft.association.rules, n = 10, by = "confidence")
inspect(top10subRules)
# Interactive graph of the confidence-filtered rules
plot(subRules_theft, method = "graph", engine = "htmlwidget")
## Individual rule representation (parallel coordinates, top 10 rules)
plot(top10subRules, method = "paracoord")

# Two-key plot of all violent-crime rules
plot(vc.association.rules, method = "two-key plot")

# Rules with confidence above 0.15
subRules_vc <- vc.association.rules[
  quality(vc.association.rules)[["confidence"]] > 0.15
]
## Select the 10 rules with the highest confidence
top10subRules <- head(vc.association.rules, n = 10, by = "confidence")
inspect(top10subRules)
# Interactive graph of the confidence-filtered rules
plot(subRules_vc, method = "graph", engine = "htmlwidget")
## Individual rule representation (parallel coordinates, top 10 rules)
plot(top10subRules, method = "paracoord")

# Two-key plot of all weapons-violation rules
plot(wv.association.rules, method = "two-key plot")

# Rules with confidence above 0.1
subRules_wv <- wv.association.rules[
  quality(wv.association.rules)[["confidence"]] > 0.1
]
## Select the 10 rules with the highest confidence
top10subRules <- head(wv.association.rules, n = 10, by = "confidence")
inspect(top10subRules)
# Interactive graph of the confidence-filtered rules
plot(subRules_wv, method = "graph", engine = "htmlwidget")
## Individual rule representation (parallel coordinates, top 10 rules)
plot(top10subRules, method = "paracoord")

# Two-key plot of all rules with item "8" on the right-hand side
plot(ocho.association.rules, method = "two-key plot")

# Rules with confidence above 0.01
subRules_8 <- ocho.association.rules[
  quality(ocho.association.rules)[["confidence"]] > 0.01
]
## Select the 20 rules with the highest confidence
top10subRules <- head(ocho.association.rules, n = 20, by = "confidence")
inspect(top10subRules)
# Interactive graph of the confidence-filtered rules
plot(subRules_8, method = "graph", engine = "htmlwidget")
## Individual rule representation (parallel coordinates, top 20 rules)
plot(top10subRules, method = "paracoord")

Mapas de Densidad
## INITIALIZE
library("leaflet")
library("data.table")
library("sp")
library("rgdal")
# library("maptools")
library("KernSmooth")
library(viridis)
Loading required package: viridisLite
library(RColorBrewer)
# Drop rows containing NA from each per-crime table, then convert the result
# to a data.table.
assault <- setDT(na.omit(assault))
criminal_damage <- setDT(na.omit(criminal_damage))
deceptive_practice <- setDT(na.omit(deceptive_practice))
narcotics <- setDT(na.omit(narcotics))
robbery <- setDT(na.omit(robbery))
theft <- setDT(na.omit(theft))
violent_crime <- setDT(na.omit(violent_crime))
weapons_violation <- setDT(na.omit(weapons_violation))
## BUILD CONTOUR LINES
## Assault
# 2-D binned kernel density estimate over the assault coordinates
kde_assault <- bkde2D(
  assault[, list(longitude, latitude)],
  bandwidth = c(.0001, .0001),
  gridsize = c(75, 75)
)
Binning grid too coarse for current (small) bandwidth: consider increasing 'gridsize'
CL_assault <- contourLines(kde_assault$x1, kde_assault$x2, kde_assault$fhat)
## Extract the density level of each contour line
# vapply() guarantees a numeric vector whatever the number of contour lines.
LEVS_assault <- as.factor(vapply(CL_assault, `[[`, numeric(1), "level"))
NLEV_assault <- length(levels(LEVS_assault))
## Convert contour lines to polygons, using each line's position as its ID
# seq_along() avoids the 1:0 trap of 1:length() when there are no contour lines.
pgons_assault <- lapply(seq_along(CL_assault), function(i) {
  Polygons(list(Polygon(cbind(CL_assault[[i]]$x, CL_assault[[i]]$y))), ID = i)
})
spgons_assault <- SpatialPolygons(pgons_assault)
## Criminal Damage
# 2-D binned kernel density estimate over the criminal-damage coordinates
kde_cd <- bkde2D(criminal_damage[, list(longitude, latitude)],
                 bandwidth = c(.0001, .0001), gridsize = c(75, 75))
Binning grid too coarse for current (small) bandwidth: consider increasing 'gridsize'
CL_cd <- contourLines(kde_cd$x1, kde_cd$x2, kde_cd$fhat)
## Extract the density level of each contour line
# vapply() guarantees a numeric vector whatever the number of contour lines.
LEVS_cd <- as.factor(vapply(CL_cd, `[[`, numeric(1), "level"))
NLEV_cd <- length(levels(LEVS_cd))
## Convert contour lines to polygons, using each line's position as its ID
# seq_along() avoids the 1:0 trap of 1:length() when there are no contour lines.
pgons_cd <- lapply(seq_along(CL_cd), function(i) {
  Polygons(list(Polygon(cbind(CL_cd[[i]]$x, CL_cd[[i]]$y))), ID = i)
})
less than 4 coordinates in polygonless than 4 coordinates in polygon
# Bundle the criminal-damage polygons into a single SpatialPolygons object
spgons_cd <- SpatialPolygons(pgons_cd)
## Deceptive Practice
# 2-D binned kernel density estimate over the deceptive-practice coordinates
kde_dp <- bkde2D(
  deceptive_practice[, list(longitude, latitude)],
  bandwidth = c(.0001, .0001),
  gridsize = c(75, 75)
)
Binning grid too coarse for current (small) bandwidth: consider increasing 'gridsize'
CL_dp <- contourLines(kde_dp$x1, kde_dp$x2, kde_dp$fhat)
## Extract the density level of each contour line
# vapply() guarantees a numeric vector whatever the number of contour lines.
LEVS_dp <- as.factor(vapply(CL_dp, `[[`, numeric(1), "level"))
NLEV_dp <- length(levels(LEVS_dp))
## Convert contour lines to polygons, using each line's position as its ID
# seq_along() avoids the 1:0 trap of 1:length() when there are no contour lines.
pgons_dp <- lapply(seq_along(CL_dp), function(i) {
  Polygons(list(Polygon(cbind(CL_dp[[i]]$x, CL_dp[[i]]$y))), ID = i)
})
spgons_dp <- SpatialPolygons(pgons_dp)
## Narcotics
# 2-D binned kernel density estimate over the narcotics coordinates
kde_narcotics <- bkde2D(narcotics[, list(longitude, latitude)],
                        bandwidth = c(.0001, .0001), gridsize = c(75, 75))
Binning grid too coarse for current (small) bandwidth: consider increasing 'gridsize'
CL_narcotics <- contourLines(kde_narcotics$x1, kde_narcotics$x2, kde_narcotics$fhat)
## Extract the density level of each contour line
# vapply() guarantees a numeric vector whatever the number of contour lines.
LEVS_narcotics <- as.factor(vapply(CL_narcotics, `[[`, numeric(1), "level"))
NLEV_narcotics <- length(levels(LEVS_narcotics))
## Convert contour lines to polygons, using each line's position as its ID
# seq_along() avoids the 1:0 trap of 1:length() when there are no contour lines.
pgons_narcotics <- lapply(seq_along(CL_narcotics), function(i) {
  Polygons(list(Polygon(cbind(CL_narcotics[[i]]$x, CL_narcotics[[i]]$y))), ID = i)
})
spgons_narcotics <- SpatialPolygons(pgons_narcotics)
## Robbery
# 2-D binned kernel density estimate over the robbery coordinates
kde_robbery <- bkde2D(robbery[, list(longitude, latitude)],
                      bandwidth = c(.0001, .0001), gridsize = c(75, 75))
Binning grid too coarse for current (small) bandwidth: consider increasing 'gridsize'
CL_robbery <- contourLines(kde_robbery$x1, kde_robbery$x2, kde_robbery$fhat)
## Extract the density level of each contour line
# vapply() guarantees a numeric vector whatever the number of contour lines.
LEVS_robbery <- as.factor(vapply(CL_robbery, `[[`, numeric(1), "level"))
NLEV_robbery <- length(levels(LEVS_robbery))
## Convert contour lines to polygons, using each line's position as its ID
# seq_along() avoids the 1:0 trap of 1:length() when there are no contour lines.
pgons_robbery <- lapply(seq_along(CL_robbery), function(i) {
  Polygons(list(Polygon(cbind(CL_robbery[[i]]$x, CL_robbery[[i]]$y))), ID = i)
})
spgons_robbery <- SpatialPolygons(pgons_robbery)
## Thefts
# 2-D binned kernel density estimate over the theft coordinates
kde_theft <- bkde2D(theft[, list(longitude, latitude)],
                    bandwidth = c(.0001, .0001), gridsize = c(75, 75))
Binning grid too coarse for current (small) bandwidth: consider increasing 'gridsize'
CL_theft <- contourLines(kde_theft$x1, kde_theft$x2, kde_theft$fhat)
## Extract the density level of each contour line
# vapply() guarantees a numeric vector whatever the number of contour lines.
LEVS_theft <- as.factor(vapply(CL_theft, `[[`, numeric(1), "level"))
NLEV_theft <- length(levels(LEVS_theft))
## Convert contour lines to polygons, using each line's position as its ID
# seq_along() avoids the 1:0 trap of 1:length() when there are no contour lines.
pgons_theft <- lapply(seq_along(CL_theft), function(i) {
  Polygons(list(Polygon(cbind(CL_theft[[i]]$x, CL_theft[[i]]$y))), ID = i)
})
spgons_theft <- SpatialPolygons(pgons_theft)
## Violent Crimes
# 2-D binned kernel density estimate over the violent-crime coordinates
kde_vc <- bkde2D(violent_crime[, list(longitude, latitude)],
                 bandwidth = c(.0001, .0001), gridsize = c(75, 75))
Binning grid too coarse for current (small) bandwidth: consider increasing 'gridsize'
CL_vc <- contourLines(kde_vc$x1, kde_vc$x2, kde_vc$fhat)
## Extract the density level of each contour line
# vapply() guarantees a numeric vector whatever the number of contour lines.
LEVS_vc <- as.factor(vapply(CL_vc, `[[`, numeric(1), "level"))
NLEV_vc <- length(levels(LEVS_vc))
## Convert contour lines to polygons, using each line's position as its ID
# seq_along() avoids the 1:0 trap of 1:length() when there are no contour lines.
pgons_vc <- lapply(seq_along(CL_vc), function(i) {
  Polygons(list(Polygon(cbind(CL_vc[[i]]$x, CL_vc[[i]]$y))), ID = i)
})
spgons_vc <- SpatialPolygons(pgons_vc)
## Weapons Violation
# 2-D binned kernel density estimate over the weapons-violation coordinates
kde_wv <- bkde2D(weapons_violation[, list(longitude, latitude)],
                 bandwidth = c(.0001, .0001), gridsize = c(75, 75))
Binning grid too coarse for current (small) bandwidth: consider increasing 'gridsize'
CL_wv <- contourLines(kde_wv$x1, kde_wv$x2, kde_wv$fhat)
## Extract the density level of each contour line
# vapply() guarantees a numeric vector whatever the number of contour lines.
LEVS_wv <- as.factor(vapply(CL_wv, `[[`, numeric(1), "level"))
NLEV_wv <- length(levels(LEVS_wv))
## Convert contour lines to polygons, using each line's position as its ID
# seq_along() avoids the 1:0 trap of 1:length() when there are no contour lines.
pgons_wv <- lapply(seq_along(CL_wv), function(i) {
  Polygons(list(Polygon(cbind(CL_wv[[i]]$x, CL_wv[[i]]$y))), ID = i)
})
less than 4 coordinates in polygon
spgons_wv <- SpatialPolygons(pgons_wv)

# Return one colour per contour level from a ColorBrewer palette.
# brewer.pal() yields at most 9 colours for these palettes, so indexing its
# result by level code gives NA for every level above 9. For more than 9
# levels the 9-colour palette is interpolated to the required length instead.
contour_palette <- function(n_levels, palette_name) {
  if (n_levels <= 9) {
    return(brewer.pal(n_levels, name = palette_name))
  }
  colorRampPalette(brewer.pal(9, name = palette_name))(n_levels)
}

# Interactive map: one toggleable polygon layer per crime type, coloured by
# density level, plus a text label per district.
leaflet() %>% addTiles() %>%
  addPolygons(data = spgons_narcotics, color = contour_palette(NLEV_narcotics, "YlOrRd")[LEVS_narcotics], group = "Narcotics") %>%
  addPolygons(data = spgons_assault, color = contour_palette(NLEV_assault, "Reds")[LEVS_assault], group = "Assault") %>%
  addPolygons(data = spgons_cd, color = contour_palette(NLEV_cd, "YlGnBu")[LEVS_cd], group = "Criminal Damage") %>%
  addPolygons(data = spgons_dp, color = contour_palette(NLEV_dp, "YlGn")[LEVS_dp], group = "Deceptive Practice") %>%
  addPolygons(data = spgons_robbery, color = contour_palette(NLEV_robbery, "Purples")[LEVS_robbery], group = "Robbery") %>%
  addPolygons(data = spgons_theft, color = contour_palette(NLEV_theft, "Oranges")[LEVS_theft], group = "Thefts") %>%
  addPolygons(data = spgons_vc, color = contour_palette(NLEV_vc, "Greys")[LEVS_vc], group = "Violent Crimes") %>%
  addPolygons(data = spgons_wv, color = contour_palette(NLEV_wv, "Blues")[LEVS_wv], group = "Weapons Violation") %>%
  addLabelOnlyMarkers(districts$longitude, districts$latitude, label = districts$district,
                      labelOptions = labelOptions(noHide = TRUE, direction = "top", textOnly = TRUE), group = "Districts") %>%
  addLayersControl(overlayGroups = c("Assault", "Criminal Damage", "Deceptive Practice", "Narcotics", "Robbery", "Thefts", "Violent Crimes", "Weapons Violation", "Districts"), options = layersControlOptions(collapsed = FALSE))
n too large, allowed maximum for palette YlOrRd is 9
Returning the palette you asked for with that many colors
n too large, allowed maximum for palette Reds is 9
Returning the palette you asked for with that many colors
n too large, allowed maximum for palette YlGnBu is 9
Returning the palette you asked for with that many colors
n too large, allowed maximum for palette YlGn is 9
Returning the palette you asked for with that many colors
n too large, allowed maximum for palette Greys is 9
Returning the palette you asked for with that many colors
#addCircles(lng = narcotics$longitude, lat = narcotics$latitude,radius = .1, opacity = .4, col = "blue", group = "Points") %>%
#leaflet() %>% addTiles() %>%
# addCircles(lng = weapons_violation$longitude, lat = weapons_violation$latitude,radius = .05, opacity = 0.1, col = brewer.pal(10,name = "Reds"), group = "Narcotics") %>%
# addLabelOnlyMarkers(districts$longitude, districts$latitude, label = districts$district,
# labelOptions = labelOptions(noHide = T, direction = 'top', textOnly = T, textsize = "15px"), group = #"Districts") %>%
# addLayersControl(overlayGroups = c("Assault", "Criminal Damage","Deceptive Practice", "Narcotics","Robbery","Thefts","Violent Crimes","Weapons Violation", "Districts"),options = layersControlOptions(collapsed = FALSE))
Clustering
# Build the clustering table by dropping the columns that are not clustered on
chicago_crime_clustering <- subset(
  chicago_crime_subset,
  select = -c(
    location_description, case_number, block, ward, description,
    day, month, latitude, longitude
  )
)
# Show the distinct crime types that remain
unique(chicago_crime_clustering$primary_type)
[1] ASSAULT NARCOTICS DECEPTIVE PRACTICE ROBBERY WEAPONS VIOLATION THEFT VIOLENT CRIME
[8] CRIMINAL DAMAGE
Levels: ASSAULT CRIMINAL DAMAGE DECEPTIVE PRACTICE NARCOTICS ROBBERY THEFT VIOLENT CRIME WEAPONS VIOLATION
library(data.table)
library(mltools)
#Crime_chicago_dummy <- one_hot(as.data.table(Crime_chicago_def_clust))
# Remember the distinct crime types before they are replaced by integer codes
types <- unique(chicago_crime_clustering$primary_type)
# Integer-encode the categorical columns by order of first appearance
chicago_crime_clustering$primary_type <- match(
  chicago_crime_clustering$primary_type,
  types
)
chicago_crime_clustering$arrest <- match(
  chicago_crime_clustering$arrest,
  unique(chicago_crime_clustering$arrest)
)
#chicago_crime_clustering$location_description <- match(chicago_crime_clustering$location_description, unique(chicago_crime_clustering$location_description))
chicago_crime_clustering$district <- as.numeric(chicago_crime_clustering$district)
test <- chicago_crime_clustering
# Prepare the training table the same way
library(RSNNS)
train_set <- subset(
  chicago_crime_subset_tr,
  select = -c(
    location_description, case_number, block, ward, description,
    day, month, latitude, longitude
  )
)
# NOTE(review): codes are assigned per table, so the same crime type may get
# a different integer in train_set and in test.
train_set$primary_type <- match(
  train_set$primary_type,
  unique(train_set$primary_type)
)
train_set$arrest <- match(train_set$arrest, unique(train_set$arrest))
#train_set$location_description <- match(train_set$location_description, unique(train_set$location_description))
train_set$district <- as.numeric(train_set$district)
#index <- sample(nrow(chicago_crime_clustering_cut), round(0.75*nrow(chicago_crime_clustering_cut)))
#train <- Crime_chicago_def_clust_cut[index,]
#test <- Crime_chicago_def_clust_cut[-index,]
# Keep the first column of each table as its label
train_label <- train_set[, 1]
test_label <- test[, 1]
# Optimum number of clusters (elbow method)
# Alternative using the fviz function for the elbow method
library(factoextra)
library(NbClust)
set.seed(123)
# Work on the first 1000 training rows and the first 100 test rows
train_small <- train_set[seq_len(1000), ]
test_small <- test[seq_len(100), ]
## Dendrograms
library(tidyverse) # data manipulation and visualization
library(class) # class package for kNN
library(caret)
library(cluster)
# Divisive hierarchical clustering (diana)
div <- diana(train_small)
plot(div, main = "Divisive")


# Agglomerative clustering: complete linkage on Euclidean distances
distance <- dist(train_small, method = "euclidean")
agg <- hclust(distance, method = "complete")
plot(
  agg,
  main = "Agglomerative, complete linkages"
)

library(fpc)
# Tabulate cluster-validity statistics for every cut of a hierarchical tree,
# from 2 clusters up to `k` clusters.
#
# Args:
#   dist: dissimilarity object, passed to cluster.stats() as `d`.
#   tree: clustering tree accepted by cutree().
#   k:    largest number of clusters to evaluate (at least 2).
#
# Returns:
#   A data frame with one column per cut ("Test 1" is the 2-cluster cut) and
#   one row per statistic listed in `clust.assess`, followed by `k` rows with
#   the size of each cluster (0 where a cut has fewer clusters). Numeric
#   values are rounded to 2 decimals.
cstats.table <- function(dist, tree, k) {
  stopifnot(is.numeric(k), length(k) == 1, k >= 2)
  clust.assess <- c("cluster.number", "n", "within.cluster.ss", "average.within",
                    "average.between", "wb.ratio", "dunn2", "avg.silwidth")
  row.clust <- paste("Cluster-", seq_len(k), " size")
  # Column 1 (the 1-cluster cut) is never filled and is dropped further down.
  output.stats <- matrix(NA_real_, nrow = length(clust.assess), ncol = k)
  cluster.sizes <- matrix(NA_real_, nrow = k, ncol = k)
  for (i in 2:k) {
    # cluster.stats() is computed once per cut rather than once per cell.
    cut.stats <- cluster.stats(d = dist, clustering = cutree(tree, k = i))
    output.stats[, i] <- unlist(cut.stats[clust.assess])[seq_along(clust.assess)]
    # Indexing past the number of clusters yields NA, replaced by 0 below.
    cluster.sizes[, i] <- unlist(cut.stats["cluster.size"])[seq_len(k)]
  }
  output.stats.df <- data.frame(output.stats)
  cluster.sizes <- data.frame(cluster.sizes)
  cluster.sizes[is.na(cluster.sizes)] <- 0
  # drop = FALSE keeps a data frame when k = 2 leaves a single column.
  output <- rbind(output.stats.df, cluster.sizes)[, -1, drop = FALSE]
  colnames(output) <- paste("Test", seq_len(k - 1))
  rownames(output) <- c(clust.assess, row.clust)
  is.num <- vapply(output, is.numeric, logical(1))
  output[is.num] <- lapply(output[is.num], round, 2)
  output
}
# Cap the number of clusters at 7: a reasonable number that should still show
# the basic differences between groups.
stats.df.divisive <- cstats.table(dist = distance, tree = div, k = 7)
stats.df.divisive
# Complete linkage looks like the most balanced approach
stats.df.aggl <- cstats.table(dist = distance, tree = agg, k = 7)
stats.df.aggl
#confusionMatrix(train_small, )
library(data.table)
library(mltools)
#Crime_chicago_dummy <- one_hot(as.data.table(Crime_chicago_def_clust))
#narcotics <- subset(narcotics, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
#narcotics_tr <- subset(narcotics_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
# Build the narcotics clustering tables without the location description
narcotics_clustering <- subset(narcotics, select = -c(location_description))
narcotics_clustering_tr <- subset(narcotics_tr, select = -c(location_description))
# NOTE(review): `types` is still taken from chicago_crime_clustering here;
# confirm whether it should come from narcotics_clustering instead.
types <- unique(chicago_crime_clustering$primary_type)
# Integer-encode the narcotics table from its OWN columns. The previous code
# filled these columns from chicago_crime_clustering, a different table.
narcotics_clustering$primary_type <- match(narcotics_clustering$primary_type, unique(narcotics_clustering$primary_type))
narcotics_clustering$arrest <- match(narcotics_clustering$arrest, unique(narcotics_clustering$arrest))
#narcotics_clustering$location_description <- match(chicago_crime_clustering$location_description, unique(narcotics_clustering$location_description))
narcotics_clustering$district <- as.numeric(narcotics_clustering$district)
test <- narcotics_clustering
# Prepare the training table the same way
library(RSNNS)
train_set <- narcotics_clustering_tr
train_set$primary_type <- match(train_set$primary_type, unique(train_set$primary_type))
train_set$arrest <- match(train_set$arrest, unique(train_set$arrest))
#train_set$location_description <- match(train_set$location_description, unique(train_set$location_description))
train_set$district <- as.numeric(train_set$district)
#index <- sample(nrow(chicago_crime_clustering_cut), round(0.75*nrow(chicago_crime_clustering_cut)))
#train <- Crime_chicago_def_clust_cut[index,]
#test <- Crime_chicago_def_clust_cut[-index,]
# Keep the first column of each table as its label
train_label <- train_set[, 1]
test_label <- test[, 1]
# Optimum number of clusters (elbow method)
# Alternative using the fviz function for the elbow method
library(factoextra)
library(NbClust)
set.seed(123)
# Work on the first 1000 training rows and the first 100 test rows
train_small <- train_set[seq_len(1000), ]
test_small <- test[seq_len(100), ]
## Dendrograms
library(tidyverse) # data manipulation and visualization
library(class) # class package for kNN
library(caret)
library(cluster)
# Divisive hierarchical clustering (diana)
div <- diana(train_small)
plot(div, main = "Divisive")


# Agglomerative clustering: complete linkage on Euclidean distances
distance <- dist(train_small, method = "euclidean")
agg <- hclust(distance, method = "complete")
plot(
  agg,
  main = "Agglomerative, complete linkages"
)

library(fpc)
# Tabulate cluster-validity statistics for every cut of a hierarchical tree,
# from 2 clusters up to `k` clusters.
#
# Args:
#   dist: dissimilarity object, passed to cluster.stats() as `d`.
#   tree: clustering tree accepted by cutree().
#   k:    largest number of clusters to evaluate (at least 2).
#
# Returns:
#   A data frame with one column per cut ("Test 1" is the 2-cluster cut) and
#   one row per statistic listed in `clust.assess`, followed by `k` rows with
#   the size of each cluster (0 where a cut has fewer clusters). Numeric
#   values are rounded to 2 decimals.
cstats.table <- function(dist, tree, k) {
  stopifnot(is.numeric(k), length(k) == 1, k >= 2)
  clust.assess <- c("cluster.number", "n", "within.cluster.ss", "average.within",
                    "average.between", "wb.ratio", "dunn2", "avg.silwidth")
  row.clust <- paste("Cluster-", seq_len(k), " size")
  # Column 1 (the 1-cluster cut) is never filled and is dropped further down.
  output.stats <- matrix(NA_real_, nrow = length(clust.assess), ncol = k)
  cluster.sizes <- matrix(NA_real_, nrow = k, ncol = k)
  for (i in 2:k) {
    # cluster.stats() is computed once per cut rather than once per cell.
    cut.stats <- cluster.stats(d = dist, clustering = cutree(tree, k = i))
    output.stats[, i] <- unlist(cut.stats[clust.assess])[seq_along(clust.assess)]
    # Indexing past the number of clusters yields NA, replaced by 0 below.
    cluster.sizes[, i] <- unlist(cut.stats["cluster.size"])[seq_len(k)]
  }
  output.stats.df <- data.frame(output.stats)
  cluster.sizes <- data.frame(cluster.sizes)
  cluster.sizes[is.na(cluster.sizes)] <- 0
  # drop = FALSE keeps a data frame when k = 2 leaves a single column.
  output <- rbind(output.stats.df, cluster.sizes)[, -1, drop = FALSE]
  colnames(output) <- paste("Test", seq_len(k - 1))
  rownames(output) <- c(clust.assess, row.clust)
  is.num <- vapply(output, is.numeric, logical(1))
  output[is.num] <- lapply(output[is.num], round, 2)
  output
}
# Cap the number of clusters at 7: a reasonable number that should still show
# the basic differences between groups.
stats.df.divisive <- cstats.table(dist = distance, tree = div, k = 7)
stats.df.divisive
# Complete linkage looks like the most balanced approach
stats.df.aggl <- cstats.table(dist = distance, tree = agg, k = 7)
stats.df.aggl
#confusionMatrix(train_small, )
library("ggplot2")
library("reshape2")
Attaching package: ‘reshape2’
The following object is masked from ‘package:tidyr’:
smiths
The following objects are masked from ‘package:data.table’:
dcast, melt
library("purrr")
library("dplyr")
# let's start with a dendrogram
library("dendextend")
---------------------
Welcome to dendextend version 1.13.4
Type citation('dendextend') for how to cite the package.
Type browseVignettes(package = 'dendextend') for the package vignette.
The github page is: https://github.com/talgalili/dendextend/
Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
Or contact: <tal.galili@gmail.com>
To suppress this message use: suppressPackageStartupMessages(library(dendextend))
---------------------
Attaching package: ‘dendextend’
The following object is masked from ‘package:data.table’:
set
The following object is masked from ‘package:stats’:
cutree
# Turn the agglomerative tree into a dendrogram and style it.
dendro <- as.dendrogram(agg)
# The branches are coloured for 8 clusters but only 7 colours are listed, so
# the list is recycled explicitly to 8 entries instead of relying on
# dendextend's implicit recycling (which emits a warning).
dendro.n.clusters <- 8
dendro.branch.colors <- c("darkslategray", "darkslategray4", "darkslategray3",
                          "gold3", "darkcyan", "cyan3", "gold3")
# `set` is namespaced because data.table exports a function of the same name.
dendro.col <- dendro %>%
  dendextend::set("branches_k_color", k = dendro.n.clusters,
                  value = rep_len(dendro.branch.colors, dendro.n.clusters)) %>%
  dendextend::set("branches_lwd", 0.6) %>%
  dendextend::set("labels_colors",
                  value = c("darkslategray")) %>%
  dendextend::set("labels_cex", 0.5)
Length of color vector was shorter than the number of clusters - color vector was recycled
# Convert the styled dendrogram for ggplot2 and draw it
ggd1 <- as.ggdend(dendro.col)
ggplot(ggd1, theme = theme_minimal()) +
  labs(
    x = "Num. observations",
    y = "Height",
    title = "Dendrogram, k = 8"
  )

NA
NA
NA
Arboles de Decision
## c50
library(dplyr)
library(MASS) # for obtaining data
library(tidyverse) # for data processing
library(rpart) # for CART decision tree
library(rpart.plot) # for plotting CART
library(caret) # for confusion matrix and more
library(rsample) # for data splitting
library(data.table)
library(C50)
#levels(chicago_crime$location_description)[1] = "None"
## Create the tables used for the decision trees
set.seed(1234)
# Drop the columns that are not used by the trees
chicago_crime_trees <- subset(
  chicago_crime_subset,
  select = -c(
    location_description, case_number, block, ward, description,
    day, month, latitude, longitude
  )
)
chicago_crime_trees_tr <- subset(
  chicago_crime_subset_tr,
  select = -c(
    location_description, case_number, block, ward, description,
    day, month, latitude, longitude
  )
)
library(dplyr)
# Convert factor columns to character so the labels can be overwritten
chicago_crime_trees <- mutate_if(chicago_crime_trees, is.factor, as.character)
chicago_crime_trees_tr <- mutate_if(chicago_crime_trees_tr, is.factor, as.character)
# Short code for every crime type
crime_type_codes <- c(
  "ROBBERY" = "ROB",
  "NARCOTICS" = "NAR",
  "ASSAULT" = "ASS",
  "WEAPONS VIOLATION" = "WV",
  "CRIMINAL DAMAGE" = "CD",
  "VIOLENT CRIME" = "VC",
  "DECEPTIVE PRACTICE" = "DP",
  "THEFT" = "TH"
)
# Replace each full crime type by its short code in both tables
for (crime_type in names(crime_type_codes)) {
  chicago_crime_trees$primary_type[
    chicago_crime_trees$primary_type == crime_type
  ] <- crime_type_codes[[crime_type]]
  chicago_crime_trees_tr$primary_type[
    chicago_crime_trees_tr$primary_type == crime_type
  ] <- crime_type_codes[[crime_type]]
}
#train_c50<- chicago_crime_trees_tr
#test_c50<- chicago_crime_trees
# 80/20 train/test split of the training subset.
# NOTE(review): the split reads chicago_crime_subset_tr directly, so the
# abbreviated labels in chicago_crime_trees_tr are not used here.
crime_split_c50 <- initial_split(
  subset(
    chicago_crime_subset_tr,
    select = -c(
      location_description, case_number, block, ward, description,
      day, month, latitude, longitude
    )
  ),
  prop = 0.8
)
train_c50 <- training(crime_split_c50)
test_c50 <- testing(crime_split_c50)
#train_c50$location_description <- as.factor(train_c50$location_description)
#test_c50$location_description <- as.factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
#train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
#test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
# Convert the predictors and the outcome to factors in both splits
for (factor_column in c("arrest", "district", "primary_type")) {
  train_c50[[factor_column]] <- as.factor(train_c50[[factor_column]])
  test_c50[[factor_column]] <- as.factor(test_c50[[factor_column]])
}
# Fit the C5.0 decision tree (a higher CF means less pruning)
tree_result <- C5.0(
  primary_type ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.2)
)
summary(tree_result)
Call:
C5.0.formula(formula = primary_type ~ ., data = train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF = 0.2))
C5.0 [Release 2.07 GPL Edition] Fri May 8 13:49:43 2020
-------------------------------
Class specified by attribute `outcome'
Read 488358 cases (3 attributes) from undefined.data
Decision tree:
arrest = False: THEFT (396262/242319)
arrest = True:
:...district in {1,2,12,14,16,17,18,19,20,24,31}: THEFT (29329/19863)
district in {3,4,5,6,7,8,9,10,11,15,22,25}: NARCOTICS (62767/36815)
Evaluation on training data (488358 cases):
Decision Tree
----------------
Size Errors
3 298997(61.2%) <<
(a) (b) (c) (d) (e) (f) (g) (h) <-classified as
----- ----- ----- ----- ----- ----- ----- -----
4197 30630 (a): class ASSAULT
1976 60313 (b): class CRIMINAL DAMAGE
970 41627 (c): class DECEPTIVE PRACTICE
25952 4001 (d): class NARCOTICS
7181 65181 (e): class ROBBERY
7604 163409 (f): class THEFT
7881 56651 (g): class VIOLENT CRIME
7006 3779 (h): class WEAPONS VIOLATION
Attribute usage:
100.00% arrest
18.86% district
Time: 0.2 secs
# Plot the fitted tree.
plot(tree_result, subtree = NULL)

## PREDICTION
# Predict the class of every row in the test partition.
predictions <- predict(tree_result, newdata = test_c50, type = "class")
#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
# Flag the rows where the predicted class equals the recorded primary_type;
# the error rate is the share of rows that are not flagged.
test_hits <- predictions == test_c50$primary_type
error_classification <- mean(!test_hits)
paste(
  "The classification error in test set is:",
  100 * error_classification,
  "%",
  sum(test_hits),
  "correct classified cases from",
  length(predictions)
)
[1] "The classification error in test set is: 61.0087722890678 % 47604 correct classified cases from 122089"
# Re-predict the training partition to measure the error on the training data.
pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)
# Rows where the prediction equals the recorded primary_type; the error rate
# is the share of rows that do not match.
train_hits <- pred_train == train_c50$primary_type
error_classification <- mean(!train_hits)
paste(
  "The classification error in train set is:",
  100 * error_classification,
  "%",
  sum(train_hits),
  "correct classified cases from",
  length(pred_train)
)
[1] "The classification error in train set is: 61.2249620155705 % 189361 correct classified cases from 488358"
# Fit a C5.0 decision tree that predicts district from the remaining columns.
# A higher CF means less pruning.
tree_result <- C5.0(
  district ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.1)
)
summary(tree_result)
Call:
C5.0.formula(formula = district ~ ., data = train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF = 0.1))
C5.0 [Release 2.07 GPL Edition] Fri May 8 13:47:32 2020
-------------------------------
Class specified by attribute `outcome'
Read 488358 cases (4 attributes) from undefined.data
Decision tree:
primary_type in {NARCOTICS,WEAPONS VIOLATION}:
:...primary_type = WEAPONS VIOLATION:
: :...arrest = False:
: : :...location_description <= 18:
: : : :...location_description <= 14: 7 (9/6)
: : : : location_description > 14: 25 (400/355)
: : : location_description > 18:
: : : :...location_description <= 19: 11 (210/167)
: : : location_description > 19:
: : : :...location_description > 128: 11 (1516/1354)
: : : location_description <= 128:
: : : :...location_description > 110: 7 (534/464)
: : : location_description <= 110:
: : : :...location_description <= 64: 5 (50/39)
: : : location_description > 64: 4 (150/131)
: : arrest = True:
: : :...location_description > 113:
: : :...location_description <= 135:
: : : :...location_description <= 121: 7 (1064/902)
: : : : location_description > 121: 10 (1567/1339)
: : : location_description > 135:
: : : :...location_description <= 141: 7 (3228/2755)
: : : location_description > 141:
: : : :...location_description <= 144: 10 (72/51)
: : : location_description > 144: 11 (404/314)
: : location_description <= 113:
: : :...location_description <= 34:
: : :...location_description <= 15:
: : : :...location_description <= 4: 7 (25/16)
: : : : location_description > 4: 16 (13)
: : : location_description > 15:
: : : :...location_description <= 17: 7 (466/413)
: : : location_description > 17:
: : : :...location_description <= 19: 11 (343/293)
: : : location_description > 19:
: : : :...location_description <= 30: 15 (23/18)
: : : location_description > 30: 16 (7/4)
: : location_description > 34:
: : :...location_description <= 44: 5 (76/53)
: : location_description > 44:
: : :...location_description > 105:
: : :...location_description <= 107: 4 (90/79)
: : : location_description > 107: 6 (206/174)
: : location_description <= 105:
: : :...location_description <= 73:
: : :...location_description <= 55: 3 (41/35)
: : : location_description > 55: 1 (42/31)
: : location_description > 73:
: : :...location_description <= 77: 6 (134/104)
: : location_description > 77:
: : :...location_description <= 101: 5 (45/39)
: : location_description > 101:
: : :...location_description <= 103: 11 (63/49)
: : location_description > 103: 1 (7/4)
: primary_type = NARCOTICS:
: :...location_description > 110:
: :...location_description > 133: 11 (10740/8406)
: : location_description <= 133:
: : :...location_description > 131: 11 (8624/4943)
: : location_description <= 131:
: : :...location_description <= 113: 11 (827/288)
: : location_description > 113:
: : :...location_description <= 121:
: : :...location_description <= 116: 7 (1519/1223)
: : : location_description > 116: 11 (996/597)
: : location_description > 121:
: : :...location_description <= 127:
: : :...location_description <= 124: 11 (110/88)
: : : location_description > 124: 18 (52/45)
: : location_description > 127:
: : :...location_description > 129: 10 (82/69)
: : location_description <= 129:
: : :...location_description <= 128: 8 (17/14)
: : location_description > 128: 25 (369/327)
: location_description <= 110:
: :...location_description <= 19:
: :...location_description > 16: 11 (3029/2243)
: : location_description <= 16:
: : :...location_description <= 2: 11 (230/129)
: : location_description > 2: 16 (128/33)
: location_description > 19:
: :...location_description > 107: 11 (821/674)
: location_description <= 107:
: :...location_description > 77:
: :...location_description <= 79: 16 (217/44)
: : location_description > 79:
: : :...location_description > 101:
: : :...location_description <= 103: 11 (347/240)
: : : location_description > 103:
: : : :...location_description <= 105: 1 (70/4)
: : : location_description > 105: 10 (298/226)
: : location_description <= 101:
: : :...location_description <= 87: 7 (201/165)
: : location_description > 87:
: : :...location_description <= 90: 18 (31/16)
: : location_description > 90:
: : :...location_description > 98: 15 (39/31)
: : location_description <= 98:
: : :...location_description <= 92: 10 (41/28)
: : location_description > 92: 12 (59/49)
: location_description <= 77:
: :...location_description > 73: 11 (428/308)
: location_description <= 73:
: :...location_description > 62: 25 (119/106)
: location_description <= 62:
: :...location_description > 54:
: :...location_description > 59: 1 (54/31)
: : location_description <= 59:
: : :...location_description <= 57: 1 (66/49)
: : location_description > 57: 6 (83/50)
: location_description <= 54:
: :...location_description > 50: 11 (128/95)
: location_description <= 50:
: :...location_description > 44:
: :...location_description <= 48: 18 (20/9)
: : location_description > 48: 1 (13/10)
: location_description <= 44:
: :...location_description <= 27: [S1]
: location_description > 27: [S2]
primary_type in {ASSAULT,CRIMINAL DAMAGE,ROBBERY,VIOLENT CRIME}:
:...location_description <= 19:
: :...location_description <= 16:
: : :...location_description <= 2: 7 (379/308)
: : : location_description > 2: 16 (777/111)
: : location_description > 16:
: : :...location_description > 18: 3 (28087/24753)
: : location_description <= 18:
: : :...location_description > 17: 18 (39/34)
: : location_description <= 17:
: : :...primary_type in {CRIMINAL DAMAGE,
: : : VIOLENT CRIME}: 11 (2839/2595)
: : primary_type = ASSAULT:
: : :...arrest = False: 25 (785/709)
: : : arrest = True: 11 (179/162)
: : primary_type = ROBBERY:
: : :...arrest = False: 11 (1916/1722)
: : arrest = True: 19 (159/142)
: location_description > 19:
: :...location_description <= 109:
: :...location_description <= 40:
: : :...location_description > 34:
: : : :...location_description > 38: 5 (868/648)
: : : : location_description <= 38:
: : : : :...location_description <= 36: 5 (742/599)
: : : : location_description > 36:
: : : : :...primary_type = CRIMINAL DAMAGE: 19 (7/5)
: : : : primary_type in {ROBBERY,
: : : : : VIOLENT CRIME}: 1 (193/124)
: : : : primary_type = ASSAULT:
: : : : :...arrest = False: 2 (82/53)
: : : : arrest = True: 1 (22/15)
: : : location_description <= 34:
: : : :...location_description <= 25:
: : : :...primary_type in {ASSAULT,CRIMINAL DAMAGE,ROBBERY}:
: : : : :...location_description <= 20: 9 (49/42)
: : : : : location_description > 20: 1 (602/506)
: : : : primary_type = VIOLENT CRIME:
: : : : :...location_description <= 21:
: : : : :...location_description <= 20: 6 (20/16)
: : : : : location_description > 20: 1 (46/35)
: : : : location_description > 21:
: : : : :...location_description <= 24: 8 (198/173)
: : : : location_description > 24: 1 (24/17)
: : : location_description > 25:
: : : :...location_description <= 26:
: : : :...primary_type in {ASSAULT,CRIMINAL DAMAGE,
: : : : : ROBBERY}: 19 (470/369)
: : : : primary_type = VIOLENT CRIME: 18 (1234/866)
: : : location_description > 26:
: : : :...location_description > 32: 17 (176/153)
: : : location_description <= 32:
: : : :...location_description <= 29: 6 (314/281)
: : : location_description > 29:
: : : :...location_description <= 30: 1 (30/12)
: : : location_description > 30: [S3]
: : location_description > 40:
: : :...location_description <= 79:
: : :...location_description > 73:
: : : :...location_description > 77: 1 (538/424)
: : : : location_description <= 77:
: : : : :...primary_type = CRIMINAL DAMAGE: 6 (557/488)
: : : : primary_type in {ASSAULT,ROBBERY,VIOLENT CRIME}:
: : : : :...primary_type = ASSAULT: 6 (798/697)
: : : : primary_type = ROBBERY: 11 (1721/1425)
: : : : primary_type = VIOLENT CRIME:
: : : : :...location_description <= 75: 10 (7/4)
: : : : location_description > 75: 11 (850/747)
: : : location_description <= 73:
: : : :...primary_type = CRIMINAL DAMAGE:
: : : :...location_description <= 54:
: : : : :...location_description <= 44:
: : : : : :...arrest = False: 2 (286/255)
: : : : : : arrest = True: 7 (9/6)
: : : : : location_description > 44:
: : : : : :...location_description <= 50:
: : : : : :...location_description <= 46: 4 (45/39)
: : : : : : location_description > 46:
: : : : : : :...location_description <= 47: 1 (21/15)
: : : : : : location_description > 47: 12 (627/559)
: : : : : location_description > 50:
: : : : : :...location_description <= 52:
: : : : : :...location_description <= 51: 1 (225/203)
: : : : : : location_description > 51: 6 (7/4)
: : : : : location_description > 52:
: : : : : :...arrest = True: 9 (32/27)
: : : : : arrest = False: [S4]
: : : : location_description > 54:
: : : : :...location_description <= 60:
: : : : :...arrest = False: 24 (810/591)
: : : : : arrest = True: 1 (74/53)
: : : : location_description > 60:
: : : : :...arrest = True: 3 (44/38)
: : : : arrest = False:
: : : : :...location_description > 64: 8 (540/468)
: : : : location_description <= 64:
: : : : :...location_description > 62: 18 (110/89)
: : : : location_description <= 62: [S5]
: : : primary_type in {ASSAULT,ROBBERY,VIOLENT CRIME}:
: : : :...location_description <= 54:
: : : :...location_description > 50:
: : : : :...location_description > 52:
: : : : : :...location_description <= 53: 6 (1016/913)
: : : : : : location_description > 53: 11 (653/584)
: : : : : location_description <= 52: [S6]
: : : : location_description <= 50:
: : : : :...location_description > 46:
: : : : :...location_description <= 48: 1 (140/98)
: : : : : location_description > 48:
: : : : : :...location_description <= 49: 1 (1110/952)
: : : : : location_description > 49: 12 (336/275)
: : : : location_description <= 46:
: : : : :...arrest = True:
: : : : :...location_description <= 44: 18 (87/77)
: : : : : location_description > 44: 5 (39/31)
: : : : arrest = False:
: : : : :...location_description > 43: 8 (91/78)
: : : : location_description <= 43: [S7]
: : : location_description > 54:
: : : :...location_description <= 60:
: : : :...primary_type in {ASSAULT,
: : : : : VIOLENT CRIME}: 1 (1631/1245)
: : : : primary_type = ROBBERY:
: : : : :...arrest = False: 1 (700/554)
: : : : arrest = True:
: : : : :...location_description <= 57: 1 (122/98)
: : : : location_description > 57: [S8]
: : : location_description > 60:
: : : :...location_description <= 64:
: : : :...location_description <= 62: 3 (230/207)
: : : : location_description > 62: 1 (956/765)
: : : location_description > 64:
: : : :...location_description > 67:
: : : :...location_description <= 68: [S9]
: : : : location_description > 68: [S10]
: : : location_description <= 67:
: : : :...location_description > 65: [S11]
: : : location_description <= 65:
: : : :...arrest = False: 16 (196/171)
: : : arrest = True: [S12]
: : location_description > 79:
: : :...location_description <= 89:
: : :...location_description > 86:
: : : :...arrest = False: 18 (613/430)
: : : : arrest = True: 1 (348/212)
: : : location_description <= 86:
: : : :...location_description > 84:
: : : :...arrest = False: 12 (782/657)
: : : : arrest = True: 18 (384/294)
: : : location_description <= 84:
: : : :...location_description > 81: 3 (55/49)
: : : location_description <= 81:
: : : :...primary_type = ASSAULT: 18 (425/389)
: : : primary_type = CRIMINAL DAMAGE: 6 (225/205)
: : : primary_type in {ROBBERY,
: : : VIOLENT CRIME}: 19 (1236/1138)
: : location_description > 89:
: : :...location_description > 100:
: : :...arrest = True:
: : : :...location_description <= 106: 1 (1848/1539)
: : : : location_description > 106: 6 (1264/1133)
: : : arrest = False:
: : : :...primary_type = CRIMINAL DAMAGE:
: : : :...location_description > 107: 1 (3235/2863)
: : : : location_description <= 107:
: : : : :...location_description <= 105: 8 (1909/1754)
: : : : location_description > 105: 19 (257/231)
: : : primary_type in {ASSAULT,ROBBERY,VIOLENT CRIME}:
: : : :...location_description <= 104:
: : : :...location_description > 103: 1 (54/35)
: : : : location_description <= 103: [S13]
: : : location_description > 104:
: : : :...location_description > 108: 8 (2901/2669)
: : : location_description <= 108:
: : : :...location_description <= 105: 1 (63/52)
: : : location_description > 105: [S14]
: : location_description <= 100:
: : :...location_description > 98:
: : :...primary_type = CRIMINAL DAMAGE: 2 (40/34)
: : : primary_type = ROBBERY: 20 (63/56)
: : : primary_type = VIOLENT CRIME: 15 (719/607)
: : : primary_type = ASSAULT:
: : : :...arrest = False: 2 (185/161)
: : : arrest = True: 15 (16/12)
: : location_description <= 98:
: : :...location_description > 96: 18 (121/99)
: : location_description <= 96:
: : :...location_description <= 92:
: : :...location_description <= 90: 5 (54/40)
: : : location_description > 90: 12 (94/73)
: : location_description > 92:
: : :...arrest = True:
: : :...location_description <= 94: 1 (145/71)
: : : location_description > 94: 14 (24/20)
: : arrest = False:
: : :...location_description <= 93: 19 (37/27)
: : location_description > 93: [S15]
: location_description > 109:
: :...location_description <= 121:
: :...location_description > 119: 8 (13730/11829)
: : location_description <= 119:
: : :...location_description > 116:
: : :...primary_type in {ASSAULT,VIOLENT CRIME}:
: : : :...arrest = False: 11 (2154/1959)
: : : : arrest = True: 7 (449/403)
: : : primary_type in {CRIMINAL DAMAGE,ROBBERY}:
: : : :...arrest = False:
: : : :...primary_type = CRIMINAL DAMAGE: 19 (667/614)
: : : : primary_type = ROBBERY: 2 (616/568)
: : : arrest = True:
: : : :...primary_type = CRIMINAL DAMAGE: 19 (59/50)
: : : primary_type = ROBBERY: 24 (413/370)
: : location_description <= 116:
: : :...location_description <= 111:
: : :...location_description <= 110: 9 (45/37)
: : : location_description > 110:
: : : :...primary_type = ASSAULT: 19 (84/75)
: : : primary_type = VIOLENT CRIME: 11 (158/140)
: : : primary_type = ROBBERY: 3 (181/140)
: : : primary_type = CRIMINAL DAMAGE:
: : : :...arrest = False: 11 (67/57)
: : : arrest = True: 10 (61/51)
: : location_description > 111:
: : :...primary_type = ROBBERY:
: : :...arrest = False: 8 (11026/9771)
: : : arrest = True: 5 (963/871)
: : primary_type in {ASSAULT,CRIMINAL DAMAGE,
: : : VIOLENT CRIME}:
: : :...primary_type in {ASSAULT,
: : : VIOLENT CRIME}: 7 (9802/8729)
: : primary_type = CRIMINAL DAMAGE:
: : :...arrest = False: 8 (10185/9121)
: : arrest = True: 4 (418/370)
: location_description > 121:
: :...location_description <= 122:
: :...primary_type in {ASSAULT,CRIMINAL DAMAGE,
: : : ROBBERY}: 1 (4301/3863)
: : primary_type = VIOLENT CRIME: 18 (1480/1277)
: location_description > 122:
: :...location_description <= 130:
: :...location_description > 128: 8 (7069/6410)
: : location_description <= 128:
: : :...location_description <= 126: 25 (20/16)
: : location_description > 126:
: : :...primary_type in {ASSAULT,
: : : VIOLENT CRIME}: 11 (750/668)
: : primary_type = ROBBERY:
: : :...arrest = False: 12 (78/68)
: : : arrest = True: 24 (90/73)
: : primary_type = CRIMINAL DAMAGE:
: : :...arrest = True: 9 (13/10)
: : arrest = False:
: : :...location_description <= 127: 8 (87/77)
: : location_description > 127: 15 (44/38)
: location_description > 130:
: :...primary_type = CRIMINAL DAMAGE:
: :...location_description > 134: 8 (24708/22873)
: : location_description <= 134:
: : :...location_description <= 133:
: : :...arrest = False: 19 (368/337)
: : : arrest = True: 10 (168/150)
: : location_description > 133:
: : :...arrest = False: 18 (648/598)
: : arrest = True: 8 (60/50)
: primary_type in {ASSAULT,ROBBERY,VIOLENT CRIME}:
: :...location_description <= 136:
: :...location_description <= 133:
: : :...arrest = False: 11 (21858/19830)
: : : arrest = True: [S16]
: : location_description > 133:
: : :...location_description > 134: 19 (202/132)
: : location_description <= 134: [S17]
: location_description > 136:
: :...location_description > 151:
: :...location_description > 152: 7 (45/37)
: : location_description <= 152: [S18]
: location_description <= 151:
: :...primary_type = ASSAULT:
: :...location_description > 146:
: : :...arrest = False: 10 (138/125)
: : : arrest = True: 15 (27/22)
: : location_description <= 146:
: : :...location_description <= 139: [S19]
: : location_description > 139: [S20]
: primary_type in {ROBBERY,VIOLENT CRIME}:
: :...location_description <= 140:
: :...location_description <= 137: 11 (21072/19062)
: : location_description > 137: [S21]
: location_description > 140:
: :...location_description <= 144: [S22]
: location_description > 144: [S23]
primary_type in {DECEPTIVE PRACTICE,THEFT}:
:...location_description > 136:
:...location_description > 150:
: :...arrest = False: 9 (251/215)
: : arrest = True: 1 (36/28)
: location_description <= 150:
: :...primary_type = DECEPTIVE PRACTICE:
: :...arrest = False:
: : :...location_description <= 138: 1 (1926/1643)
: : : location_description > 138: 18 (668/565)
: : arrest = True:
: : :...location_description <= 138: 7 (338/306)
: : location_description > 138:
: : :...location_description > 141: 3 (30/24)
: : location_description <= 141:
: : :...location_description <= 139: 18 (6/3)
: : location_description > 139: 19 (44/35)
: primary_type = THEFT:
: :...arrest = True:
: :...location_description <= 138: 8 (1922/1757)
: : location_description > 138:
: : :...location_description > 144: 22 (118/101)
: : location_description <= 144:
: : :...location_description <= 139: 4 (66/54)
: : location_description > 139:
: : :...location_description <= 141: 19 (18/11)
: : location_description > 141: 11 (8/5)
: arrest = False:
: :...location_description <= 138: 12 (53034/48284)
: location_description > 138:
: :...location_description <= 141: 18 (552/404)
: location_description > 141:
: :...location_description <= 143: 7 (365/302)
: location_description > 143:
: :...location_description <= 146: 1 (174/142)
: location_description > 146:
: :...location_description <= 147: 14 (4302/3949)
: location_description > 147: 1 (291/254)
location_description <= 136:
:...location_description > 121:
:...location_description <= 124:
: :...arrest = False: 1 (8654/5129)
: : arrest = True: 18 (304/235)
: location_description > 124:
: :...location_description <= 131:
: :...location_description > 128: 8 (1795/1585)
: : location_description <= 128:
: : :...arrest = True: 24 (22/16)
: : arrest = False:
: : :...location_description <= 127: 1 (319/268)
: : location_description > 127: 12 (88/74)
: location_description > 131:
: :...location_description <= 133:
: :...arrest = False: 1 (5383/4398)
: : arrest = True:
: : :...primary_type = DECEPTIVE PRACTICE: 10 (168/136)
: : primary_type = THEFT: 18 (389/334)
: location_description > 133:
: :...location_description > 134: 19 (333/268)
: location_description <= 134:
: :...primary_type = DECEPTIVE PRACTICE: 18 (1467/1116)
: primary_type = THEFT: 1 (9510/7853)
location_description <= 121:
:...location_description > 109:
:...location_description > 117:
: :...location_description <= 120: 19 (5973/5206)
: : location_description > 120:
: : :...primary_type = THEFT: 8 (4387/4014)
: : primary_type = DECEPTIVE PRACTICE:
: : :...arrest = False: 16 (42/37)
: : arrest = True: 19 (10/6)
: location_description <= 117:
: :...location_description > 111:
: :...primary_type = DECEPTIVE PRACTICE: 8 (13541/12344)
: : primary_type = THEFT: 5 (10708/9898)
: location_description <= 111:
: :...location_description <= 110: 10 (65/54)
: location_description > 110:
: :...primary_type = THEFT: 11 (86/70)
: primary_type = DECEPTIVE PRACTICE:
: :...arrest = False: 9 (59/45)
: arrest = True: 5 (20/15)
location_description <= 109:
:...location_description <= 19:
:...location_description > 16:
: :...location_description <= 17: 11 (1754/1609)
: : location_description > 17: 19 (11235/10092)
: location_description <= 16:
: :...location_description > 2: 16 (1665/302)
: location_description <= 2:
: :...location_description <= 1: 19 (1757/1494)
: location_description > 1: 7 (79/61)
location_description > 19:
:...location_description <= 64:
:...location_description > 62:
: :...primary_type = DECEPTIVE PRACTICE: 18 (1288/771)
: : primary_type = THEFT: 1 (8184/5962)
: location_description <= 62:
: :...location_description <= 31:
: :...location_description > 25: 18 (2786/1577)
: : location_description <= 25:
: : :...arrest = True:
: : :...location_description > 23: 2 (70/55)
: : : location_description <= 23:
: : : :...location_description <= 20: 17 (41/26)
: : : location_description > 20: 18 (14/7)
: : arrest = False:
: : :...location_description <= 21:
: : :...location_description <= 20: 6 (77/66)
: : : location_description > 20: 1 (1059/830)
: : location_description > 21:
: : :...location_description <= 23: 19 (1204/985)
: : location_description > 23: [S24]
: location_description > 31:
: :...location_description > 55:
: :...location_description > 60: 6 (762/664)
: : location_description <= 60:
: : :...arrest = False: 1 (3966/2360)
: : arrest = True: [S25]
: location_description <= 55:
: :...location_description <= 50:
: :...location_description > 46:
: : :...location_description <= 48: 1 (213/137)
: : : location_description > 48: [S26]
: : location_description <= 46:
: : :...location_description > 34: [S27]
: : location_description <= 34: [S28]
: location_description > 50:
: :...location_description > 54: [S29]
: location_description <= 54:
: :...arrest = True: 6 (663/591)
: arrest = False: [S30]
location_description > 64:
:...location_description <= 77:
:...location_description > 74: 6 (3417/3032)
: location_description <= 74:
: :...location_description <= 65: 8 (979/817)
: location_description > 65:
: :...location_description <= 67: 1 (1995/1769)
: location_description > 67:
: :...location_description <= 69: 9 (87/72)
: location_description > 69:
: :...location_description <= 70: 1 (27/17)
: location_description > 70: 16 (54/43)
location_description > 77:
:...location_description <= 87:
:...location_description > 82: 18 (884/655)
: location_description <= 82:
: :...location_description <= 79: 1 (261/170)
: location_description > 79:
: :...primary_type = THEFT: 19 (5376/4774)
: primary_type = DECEPTIVE PRACTICE:
: :...arrest = False: 18 (438/392)
: arrest = True: 31 (24/17)
location_description > 87:
:...location_description <= 90: 18 (1670/972)
location_description > 90:
:...location_description > 107:
:...arrest = False: 1 (8715/7563)
: arrest = True: 8 (331/298)
location_description <= 107:
:...location_description > 102: 1 (1978/1288)
location_description <= 102:
:...location_description <= 93: [S31]
location_description > 93: [S32]
SubTree [S1]
location_description <= 25: 1 (12/8)
location_description > 25: 9 (12/9)
SubTree [S2]
location_description > 36: 11 (124/87)
location_description <= 36:
:...location_description <= 34: 11 (15/11)
location_description > 34: 5 (32/23)
SubTree [S3]
primary_type in {ASSAULT,ROBBERY,VIOLENT CRIME}: 18 (45/32)
primary_type = CRIMINAL DAMAGE: 19 (12/9)
SubTree [S4]
location_description <= 53: 7 (173/152)
location_description > 53: 1 (42/38)
SubTree [S5]
location_description <= 61: 6 (33/26)
location_description > 61: 8 (64/56)
SubTree [S6]
primary_type in {ASSAULT,VIOLENT CRIME}: 11 (784/701)
primary_type = ROBBERY:
:...arrest = False: 25 (492/451)
arrest = True: 7 (321/284)
SubTree [S7]
location_description <= 42: 15 (4/2)
location_description > 42:
:...primary_type = ASSAULT: 2 (84/71)
primary_type in {ROBBERY,VIOLENT CRIME}: 7 (303/272)
SubTree [S8]
location_description <= 58: 6 (297/213)
location_description > 58: 1 (87/69)
SubTree [S9]
primary_type in {ASSAULT,VIOLENT CRIME}: 9 (81/67)
primary_type = ROBBERY: 11 (54/39)
SubTree [S10]
location_description <= 70: 1 (43/30)
location_description > 70: 6 (29/24)
SubTree [S11]
primary_type in {ASSAULT,ROBBERY}: 1 (491/429)
primary_type = VIOLENT CRIME: 19 (119/100)
SubTree [S12]
primary_type in {ASSAULT,VIOLENT CRIME}: 8 (15/11)
primary_type = ROBBERY: 3 (15/12)
SubTree [S13]
primary_type in {ASSAULT,VIOLENT CRIME}: 1 (2984/2692)
primary_type = ROBBERY: 8 (1692/1525)
SubTree [S14]
location_description > 106: 11 (20/17)
location_description <= 106:
:...primary_type in {ASSAULT,VIOLENT CRIME}: 4 (1015/934)
primary_type = ROBBERY: 1 (419/376)
SubTree [S15]
primary_type = ROBBERY:
:...location_description <= 95: 1 (24/20)
: location_description > 95: 17 (44/35)
primary_type in {ASSAULT,CRIMINAL DAMAGE,VIOLENT CRIME}:
:...primary_type = VIOLENT CRIME: 1 (101/87)
primary_type = ASSAULT:
:...location_description <= 95: 12 (70/57)
: location_description > 95: 25 (67/60)
primary_type = CRIMINAL DAMAGE:
:...location_description <= 95: 16 (34/29)
location_description > 95: 19 (86/75)
SubTree [S16]
primary_type = ASSAULT: 11 (1106/1018)
primary_type = ROBBERY: 10 (754/695)
primary_type = VIOLENT CRIME: 19 (2046/1857)
SubTree [S17]
primary_type in {ASSAULT,VIOLENT CRIME}: 1 (1582/1388)
primary_type = ROBBERY:
:...arrest = False: 25 (1418/1283)
arrest = True: 1 (559/494)
SubTree [S18]
primary_type in {ASSAULT,VIOLENT CRIME}: 9 (100/79)
primary_type = ROBBERY: 12 (182/150)
SubTree [S19]
location_description > 138: 4 (91/77)
location_description <= 138:
:...arrest = False: 9 (5918/5476)
arrest = True: 4 (1141/1042)
SubTree [S20]
location_description > 143: 2 (29/23)
location_description <= 143:
:...location_description <= 141: 18 (15/11)
location_description > 141: 11 (33/27)
SubTree [S21]
location_description > 139: 18 (127/103)
location_description <= 139:
:...location_description <= 138: 4 (3/2)
location_description > 138:
:...primary_type = VIOLENT CRIME: 18 (210/172)
primary_type = ROBBERY:
:...arrest = False: 17 (105/93)
arrest = True: 4 (69/60)
SubTree [S22]
primary_type = ROBBERY:
:...arrest = False: 7 (476/398)
: arrest = True: 11 (45/38)
primary_type = VIOLENT CRIME:
:...arrest = False: 11 (99/76)
arrest = True: 7 (15/9)
SubTree [S23]
location_description <= 146: 18 (127/107)
location_description > 146:
:...location_description > 147: 18 (84/76)
location_description <= 147:
:...primary_type = ROBBERY: 8 (508/472)
primary_type = VIOLENT CRIME:
:...arrest = False: 25 (683/622)
arrest = True: 9 (95/86)
SubTree [S24]
location_description <= 24: 8 (91/77)
location_description > 24: 1 (1814/1549)
SubTree [S25]
primary_type = DECEPTIVE PRACTICE: 11 (301/183)
primary_type = THEFT: 1 (85/62)
SubTree [S26]
location_description <= 49: 1 (1772/1335)
location_description > 49: 12 (454/372)
SubTree [S27]
location_description <= 41: 5 (295/240)
location_description > 41: 2 (542/497)
SubTree [S28]
location_description <= 32: 1 (9/5)
location_description > 32:
:...location_description <= 33: 6 (134/109)
location_description > 33: 17 (11/7)
SubTree [S29]
primary_type = DECEPTIVE PRACTICE: 1 (32/10)
primary_type = THEFT: 8 (193/143)
SubTree [S30]
location_description <= 52: 1 (1654/1431)
location_description > 52:
:...location_description > 53: 7 (366/326)
location_description <= 53:
:...primary_type = DECEPTIVE PRACTICE: 1 (44/38)
primary_type = THEFT: 6 (730/661)
SubTree [S31]
location_description <= 92: 10 (12/3)
location_description > 92: 18 (114/71)
SubTree [S32]
location_description > 101: 1 (13339/11253)
location_description <= 101:
:...location_description > 99: 19 (498/430)
location_description <= 99:
:...location_description > 96: 18 (157/102)
location_description <= 96:
:...location_description <= 95: 1 (206/128)
location_description > 95: 12 (232/205)
Evaluation on training data (488358 cases):
Decision Tree
----------------
Size Errors
348 417035(85.4%) <<
Class Cases False False
Pos Neg
----- ----- ----- -----
1 32320 75133 12783
2 20764 1717 20547
3 20127 25258 16702
4 23909 2788 23609
5 18737 12496 17341
6 27195 9451 25880
7 22712 17537 20083
8 30292 88386 20639
9 21249 6165 20672
10 21641 2931 21139
11 32073 73020 17631
12 26996 50823 21783
14 20450 3969 20093
15 17498 730 17356
16 16846 774 14487
17 14975 314 14912
18 32110 11928 27449
19 25431 28462 21636
20 9202 56 9195
22 14605 101 14588
24 14664 1050 14379
25 24550 3929 24126
31 12 17 5
Attribute usage:
100.00% primary_type
100.00% location_description
42.20% arrest
Time: 0.5 secs
# Plot the fitted district tree.
plot(
  tree_result,
  faclen = 10,
  clip.facs = TRUE,
  subtree = NULL,
  tweak = 1,
  digits = 2
)

## PREDICTION
# Predict the district of every row in the test partition.
predictions <- predict(tree_result, newdata = test_c50, type = "class")
#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
# Rows where the predicted district equals the recorded one; the error rate is
# the share of rows that do not match.
test_hits <- predictions == test_c50$district
error_classification <- mean(!test_hits)
paste(
  "The classification error in test set is:",
  100 * error_classification,
  "%",
  sum(test_hits),
  "correct classified cases from",
  length(predictions)
)
[1] "The classification error in test set is: 90.6715592723341 % 11389 correct classified cases from 122089"
# Re-predict the training partition to measure the error on the training data.
pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)
# Rows where the predicted district equals the recorded one; the error rate is
# the share of rows that do not match.
train_hits <- pred_train == train_c50$district
error_classification <- mean(!train_hits)
paste(
  "The classification error in train set is:",
  100 * error_classification,
  "%",
  sum(train_hits),
  "correct classified cases from",
  length(pred_train)
)
[1] "The classification error in train set is: 85.3953452180572 % 71323 correct classified cases from 488358"
# Fit a C5.0 decision tree that predicts arrest from the remaining columns.
# A higher CF means less pruning.
tree_result <- C5.0(
  arrest ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.9)
)
summary(tree_result)

# Plot the fitted arrest tree.
plot(
  tree_result,
  faclen = 10,
  clip.facs = TRUE,
  subtree = NULL,
  tweak = 1,
  digits = 2
)

## PREDICTION
# Predict the arrest flag of every row in the test partition.
predictions <- predict(tree_result, newdata = test_c50, type = "class")
#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
# Rows where the predicted arrest flag equals the recorded one; the error rate
# is the share of rows that do not match.
test_hits <- predictions == test_c50$arrest
error_classification <- mean(!test_hits)
paste(
  "The classification error in test set is:",
  100 * error_classification,
  "%",
  sum(test_hits),
  "correct classified cases from",
  length(predictions)
)

# Same measurement on the training partition.
pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)
train_hits <- pred_train == train_c50$arrest
error_classification <- mean(!train_hits)
paste(
  "The classification error in train set is:",
  100 * error_classification,
  "%",
  sum(train_hits),
  "correct classified cases from",
  length(pred_train)
)
## Assault: predict the police district of an assault report with a C5.0
## decision tree (C5.0 is the successor of C4.5).
set.seed(1234)

# Keep only the twelve districts modelled for this crime type.
assault_districts <- c("1", "3", "4", "5", "6", "7", "8", "9", "10", "15", "18", "22")
assault_tr <- subset(assault_tr, district %in% assault_districts)
assault <- subset(assault, district %in% assault_districts)

# Drop identifiers and columns that are not used as predictors, then hold out
# 20 % of the rows for testing.
crime_split_c50 <- initial_split(
  subset(
    assault_tr,
    select = -c(case_number, block, ward, description, day, month, latitude, longitude)
  ),
  prop = 0.8
)
train_c50 <- training(crime_split_c50)
test_c50 <- testing(crime_split_c50)

# Integer-code location_description with ONE level set shared by both
# partitions. Coding train and test independently (as.numeric(as.factor(x)) on
# each) gives the same location a different number whenever the two partitions
# do not contain exactly the same set of locations, so the thresholds learned
# by the tree would be applied to the wrong locations at prediction time.
# NOTE(review): the codes are alphabetical ranks, so "location <= k" splits
# have no natural meaning; keeping the column as a factor may be preferable.
location_levels <- sort(union(
  as.character(train_c50$location_description),
  as.character(test_c50$location_description)
))
train_c50$location_description <- as.numeric(
  factor(train_c50$location_description, levels = location_levels)
)
test_c50$location_description <- as.numeric(
  factor(test_c50$location_description, levels = location_levels)
)

train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- as.factor(test_c50$arrest)

# factor() drops districts removed by the filter above; the test column reuses
# the training levels so predictions and truth can be compared with == / !=.
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district, levels = levels(train_c50$district))

# Fit the tree. A higher CF means less pruning.
tree_result <- C5.0(
  district ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.0001)
)
summary(tree_result)
Call:
C5.0.formula(formula = district ~ ., data = train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF = 1e-04))
C5.0 [Release 2.07 GPL Edition] Fri May 8 13:51:43 2020
-------------------------------
Class specified by attribute `outcome'
Read 21338 cases (4 attributes) from undefined.data
Decision tree:
location_description <= 13:
:...location_description <= 12: 8 (529/437)
: location_description > 12: 3 (2139/1703)
location_description > 13:
:...location_description <= 67:
:...location_description <= 28:
: :...location_description <= 25: 18 (182/150)
: : location_description > 25: 5 (267/200)
: location_description > 28:
: :...location_description > 55:
: :...location_description <= 56: 10 (127/102)
: : location_description > 56: 1 (1376/1111)
: location_description <= 55:
: :...location_description <= 51: 1 (1383/1125)
: location_description > 51:
: :...location_description <= 52: 6 (561/464)
: location_description > 52:
: :...location_description <= 53: 1 (130/72)
: location_description > 53: 6 (264/229)
location_description > 67:
:...location_description <= 75:
:...location_description <= 71: 4 (622/542)
: location_description > 71: 5 (3744/3222)
location_description > 75:
:...location_description <= 76: 1 (644/505)
location_description > 76:
:...location_description <= 81: 4 (1480/1288)
location_description > 81:
:...location_description > 83: 9 (4532/3983)
location_description <= 83:
:...location_description <= 82: 4 (2819/2481)
location_description > 82: 1 (539/441)
Evaluation on training data (21338 cases):
Decision Tree
----------------
Size Errors
17 18055(84.6%) <<
(a) (b) (c) (d) (e) (f) (g) (h) (i) (j) (k) (l) <-classified as
---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----
818 43 245 87 33 10 262 5 24 (a): class 1
245 436 444 287 86 36 360 1 12 (b): class 3
339 252 610 525 69 56 482 22 12 (c): class 4
203 107 432 589 55 27 333 8 10 (d): class 5
441 334 514 419 132 42 417 20 (e): class 6
297 210 468 526 128 56 465 18 9 (f): class 7
396 141 525 382 53 92 492 17 20 (g): class 8
303 129 402 307 81 74 549 19 (h): class 9
221 173 461 267 47 60 432 25 9 (i): class 10
199 213 324 236 46 33 289 17 6 (j): class 15
439 72 221 142 47 18 206 14 32 (k): class 18
171 29 275 244 48 25 245 9 (l): class 22
Attribute usage:
100.00% location_description
Time: 0.0 secs
# Draw the fitted tree.
plot(tree_result, subtree = NULL)

## PREDICTION
# Classify the held-out rows, then report the share of rows whose predicted
# district differs from the recorded one.
predictions <- predict(object = tree_result, newdata = test_c50, type = "class")
error_classification <- mean(predictions != test_c50[["district"]])
paste(
  "The classification error in test set is:",
  100 * error_classification,
  "%",
  sum(predictions == test_c50[["district"]]),
  "correct classified cases from",
  length(predictions),
  sep = " "
)
[1] "The classification error in test set is: 91.1886014248219 % 470 correct classified cases from 5334"
# Re-score the training rows with the same tree to get the resubstitution
# error (type = "class" is the default of predict() for C5.0 models).
pred_train <- predict(object = tree_result, newdata = train_c50, type = "class")
error_classification <- mean(pred_train != train_c50[["district"]])
paste(
  "The classification error in train set is:",
  100 * error_classification,
  "%",
  sum(pred_train == train_c50[["district"]]),
  "correct classified cases from",
  length(pred_train),
  sep = " "
)
[1] "The classification error in train set is: 84.6143031211922 % 3283 correct classified cases from 21338"
## Criminal damage: predict the police district with a C5.0 decision tree
## (C5.0 is the successor of C4.5).
set.seed(1234)

# Keep only the twelve districts modelled for this crime type.
criminal_damage_districts <- c("1", "3", "4", "5", "6", "7", "8", "9", "11", "18", "20", "25")
criminal_damage_tr <- subset(criminal_damage_tr, district %in% criminal_damage_districts)
criminal_damage <- subset(criminal_damage, district %in% criminal_damage_districts)

# Train and test come from two separate data sets; drop identifiers and the
# columns that are not used as predictors from both.
train_c50 <- subset(
  criminal_damage_tr,
  select = -c(case_number, block, ward, description, day, month, latitude, longitude)
)
test_c50 <- subset(
  criminal_damage,
  select = -c(case_number, block, ward, description, day, month, latitude, longitude)
)

# The test factors must reuse the TRAINING levels. Building them independently
# made predict() stop with "factor location_description has new levels FARM"
# because the test data contains a location the tree never saw. With shared
# levels an unseen location becomes NA, which predict() for C5.0 models passes
# through (its default na.action is na.pass) instead of failing.
train_c50$location_description <- as.factor(train_c50$location_description)
test_c50$location_description <- factor(
  test_c50$location_description,
  levels = levels(train_c50$location_description)
)

train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- factor(test_c50$arrest, levels = levels(train_c50$arrest))

# factor() drops districts removed by the filter above; the test column reuses
# the training levels so predictions and truth can be compared with == / !=.
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district, levels = levels(train_c50$district))

# Fit the tree. A higher CF means less pruning.
tree_result <- C5.0(
  district ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.001)
)
summary(tree_result)
Call:
C5.0.formula(formula = district ~ ., data = train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF = 0.001))
C5.0 [Release 2.07 GPL Edition] Wed May 6 16:43:00 2020
-------------------------------
Class specified by attribute `outcome'
Read 45588 cases (4 attributes) from undefined.data
Decision tree:
location_description in {ABANDONED BUILDING,
: AIRPORT BUILDING NON-TERMINAL - NON-SECURE AREA,
: AIRPORT BUILDING NON-TERMINAL - SECURE AREA,
: AIRPORT EXTERIOR - NON-SECURE AREA,
: AIRPORT EXTERIOR - SECURE AREA,AIRPORT PARKING LOT,
: AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA,
: AIRPORT TERMINAL LOWER LEVEL - SECURE AREA,
: AIRPORT TERMINAL UPPER LEVEL - NON-SECURE AREA,
: AIRPORT TERMINAL UPPER LEVEL - SECURE AREA,
: ANIMAL HOSPITAL,APPLIANCE STORE,CHA APARTMENT,
: CHA HALLWAY/STAIRWELL/ELEVATOR,
: CHA PARKING LOT/GROUNDS,
: CHURCH/SYNAGOGUE/PLACE OF WORSHIP,
: COIN OPERATED MACHINE,CREDIT UNION,
: CTA GARAGE / OTHER PROPERTY,CTA TRACKS - RIGHT OF WAY,
: CTA TRAIN,DRIVEWAY - RESIDENTIAL,
: FACTORY/MANUFACTURING BUILDING,FIRE STATION,
: FOREST PRESERVE,HIGHWAY/EXPRESSWAY,
: OTHER RAILROAD PROP / TRAIN DEPOT,PAWN SHOP,POOL ROOM,
: RESIDENCE,RESIDENCE PORCH/HALLWAY,RESIDENCE-GARAGE,
: RESIDENTIAL YARD (FRONT/BACK),SAVINGS AND LOAN,
: SCHOOL PUBLIC BUILDING,SCHOOL PUBLIC GROUNDS,
: VACANT LOT/LAND,
: VEHICLE - DELIVERY TRUCK}: 8 (14667/12108)
location_description in {APARTMENT,CTA BUS,CURRENCY EXCHANGE,
: GAS STATION}: 3 (5871/4778)
location_description in {ALLEY,ATHLETIC CLUB,ATM (AUTOMATIC TELLER MACHINE),
: AUTO / BOAT / RV DEALERSHIP,BANK,BAR OR TAVERN,
: BARBERSHOP,BOAT/WATERCRAFT,BOWLING ALLEY,BRIDGE,
: CAR WASH,CEMETARY,CLEANING STORE,
: COLLEGE/UNIVERSITY GROUNDS,
: COLLEGE/UNIVERSITY RESIDENCE HALL,
: COMMERCIAL / BUSINESS OFFICE,CONSTRUCTION SITE,
: CONVENIENCE STORE,CTA BUS STOP,CTA PLATFORM,
: CTA STATION,DAY CARE CENTER,DEPARTMENT STORE,
: DRUG STORE,FEDERAL BUILDING,
: GOVERNMENT BUILDING/PROPERTY,GROCERY FOOD STORE,
: HOSPITAL BUILDING/GROUNDS,HOTEL/MOTEL,
: JAIL / LOCK-UP FACILITY,
: LAKEFRONT/WATERFRONT/RIVERBANK,LIBRARY,
: MEDICAL/DENTAL OFFICE,MOVIE HOUSE/THEATER,
: NURSING HOME/RETIREMENT HOME,OTHER,
: OTHER COMMERCIAL TRANSPORTATION,PARK PROPERTY,
: PARKING LOT/GARAGE(NON.RESID.),
: POLICE FACILITY/VEH PARKING LOT,RESTAURANT,
: SCHOOL PRIVATE BUILDING,SCHOOL PRIVATE GROUNDS,
: SIDEWALK,SMALL RETAIL STORE,SPORTS ARENA/STADIUM,
: STREET,TAVERN/LIQUOR STORE,TAXICAB,
: VEHICLE - OTHER RIDE SERVICE,
: VEHICLE - OTHER RIDE SHARE SERVICE (),
: VEHICLE NON-COMMERCIAL,VEHICLE-COMMERCIAL,WAREHOUSE}:
:...location_description in {ALLEY,CAR WASH,DAY CARE CENTER,
: NURSING HOME/RETIREMENT HOME,SPORTS ARENA/STADIUM,
: STREET,VEHICLE NON-COMMERCIAL}: 8 (17364/15004)
location_description in {ATHLETIC CLUB,ATM (AUTOMATIC TELLER MACHINE),
AUTO / BOAT / RV DEALERSHIP,BANK,BAR OR TAVERN,
BARBERSHOP,BOAT/WATERCRAFT,BOWLING ALLEY,BRIDGE,
CEMETARY,CLEANING STORE,
COLLEGE/UNIVERSITY GROUNDS,
COLLEGE/UNIVERSITY RESIDENCE HALL,
COMMERCIAL / BUSINESS OFFICE,CONSTRUCTION SITE,
CONVENIENCE STORE,CTA BUS STOP,CTA PLATFORM,
CTA STATION,DEPARTMENT STORE,DRUG STORE,
FEDERAL BUILDING,GOVERNMENT BUILDING/PROPERTY,
GROCERY FOOD STORE,HOSPITAL BUILDING/GROUNDS,
HOTEL/MOTEL,JAIL / LOCK-UP FACILITY,
LAKEFRONT/WATERFRONT/RIVERBANK,LIBRARY,
MEDICAL/DENTAL OFFICE,MOVIE HOUSE/THEATER,OTHER,
OTHER COMMERCIAL TRANSPORTATION,PARK PROPERTY,
PARKING LOT/GARAGE(NON.RESID.),
POLICE FACILITY/VEH PARKING LOT,RESTAURANT,
SCHOOL PRIVATE BUILDING,SCHOOL PRIVATE GROUNDS,
SIDEWALK,SMALL RETAIL STORE,TAVERN/LIQUOR STORE,
TAXICAB,VEHICLE - OTHER RIDE SERVICE,
VEHICLE - OTHER RIDE SHARE SERVICE (),
VEHICLE-COMMERCIAL,WAREHOUSE}: 1 (7686/6506)
Evaluation on training data (45588 cases):
Decision Tree
----------------
Size Errors
4 38396(84.2%) <<
(a) (b) (c) (d) (e) (f) (g) (h) (i) (j) (k) (l) <-classified as
---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----
1180 65 1004 (a): class 1
451 1093 2304 (b): class 3
582 611 3429 (c): class 4
402 308 2987 (d): class 5
613 878 3080 (e): class 6
354 584 2820 (f): class 7
896 438 4919 (g): class 8
648 420 2969 (h): class 9
548 773 2873 (i): class 11
1051 96 1497 (j): class 18
341 173 835 (k): class 20
620 432 3314 (l): class 25
Attribute usage:
100.00% location_description
Time: 0.0 secs
# Draw the fitted tree.
plot(tree_result, subtree = NULL)

## PREDICTION
# Classify the rows of the test data set with the fitted tree.
# NOTE: predict() stops if a factor column of newdata carries a level that was
# not present when the tree was fitted.
predictions <- predict(object = tree_result, newdata = test_c50, type = "class")
Error in model.frame.default(object$Terms, newdata, na.action = na.action, :
factor location_description has new levels FARM
## Deceptive practice: predict the police district with a C5.0 decision tree
## (successor of C4.5); location_description is excluded from the predictors.
set.seed(1234)

# Keep only the districts modelled for this crime type.
# NOTE(review): the original condition listed district 25 twice, so only eleven
# distinct districts are kept; confirm whether a twelfth one was intended.
deceptive_practice_tr <- subset(
  deceptive_practice_tr,
  district %in% c("19", "4", "25", "2", "24", "1", "18", "20", "9", "21", "11")
)
deceptive_practice <- subset(
  deceptive_practice,
  district %in% c("19", "4", "25", "2", "24", "1", "18", "20", "9", "21", "11")
)

# Drop the unused columns and hold out 20 % of the rows for testing.
crime_split_c50 <- deceptive_practice_tr %>%
  subset(
    select = -c(location_description, case_number, block, ward, description,
                day, month, latitude, longitude)
  ) %>%
  initial_split(prop = 0.8)
train_c50 <- training(crime_split_c50)
test_c50 <- testing(crime_split_c50)

# Categorical columns as factors; factor() also drops the districts that were
# removed by the filter above.
train_c50[["arrest"]] <- as.factor(train_c50[["arrest"]])
test_c50[["arrest"]] <- as.factor(test_c50[["arrest"]])
train_c50[["district"]] <- factor(train_c50[["district"]])
test_c50[["district"]] <- factor(test_c50[["district"]])

# Fit the tree. A higher CF means less pruning.
tree_result <- C5.0(
  district ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.5)
)

# Draw the fitted tree.
plot(tree_result, subtree = NULL)

## PREDICTION
# Classify the held-out rows, then report the share of rows whose predicted
# district differs from the recorded one.
predictions <- predict(object = tree_result, newdata = test_c50, type = "class")
error_classification <- mean(predictions != test_c50[["district"]])
paste(
  "The classification error in test set is:",
  100 * error_classification,
  "%",
  sum(predictions == test_c50[["district"]]),
  "correct classified cases from",
  length(predictions),
  sep = " "
)
[1] "The classification error in test set is: 79.3173028417689 % 1230 correct classified cases from 5947"
# Re-score the training rows with the same tree to get the resubstitution
# error (type = "class" is the default of predict() for C5.0 models).
pred_train <- predict(object = tree_result, newdata = train_c50, type = "class")
error_classification <- mean(pred_train != train_c50[["district"]])
paste(
  "The classification error in train set is:",
  100 * error_classification,
  "%",
  sum(pred_train == train_c50[["district"]]),
  "correct classified cases from",
  length(pred_train),
  sep = " "
)
[1] "The classification error in train set is: 78.8584398117014 % 5030 correct classified cases from 23792"
## Narcotics: predict the police district with a C5.0 decision tree
## (successor of C4.5); location_description is excluded from the predictors
## and no district filter is applied in this chunk.
set.seed(1234)

# Drop the unused columns and hold out 20 % of the rows for testing.
crime_split_c50 <- narcotics_tr %>%
  subset(
    select = -c(location_description, case_number, block, ward, description,
                day, month, latitude, longitude)
  ) %>%
  initial_split(prop = 0.8)
train_c50 <- training(crime_split_c50)
test_c50 <- testing(crime_split_c50)

# The outcome must be a factor; factor() also drops unused district levels.
train_c50[["district"]] <- factor(train_c50[["district"]])
test_c50[["district"]] <- factor(test_c50[["district"]])

# Fit the tree. A higher CF means less pruning.
tree_result <- C5.0(
  district ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.01)
)
summary(tree_result)
Call:
C5.0.formula(formula = district ~ ., data = train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF = 0.01))
C5.0 [Release 2.07 GPL Edition] Fri May 8 13:54:54 2020
-------------------------------
Class specified by attribute `outcome'
Read 16510 cases (3 attributes) from undefined.data
Decision tree:
11 (16510/7867)
Evaluation on training data (16510 cases):
Decision Tree
----------------
Size Errors
1 7867(47.6%) <<
(a) (b) (c) (d) (e) (f) <-classified as
---- ---- ---- ---- ---- ----
364 (a): class 1
1199 (b): class 8
3178 (c): class 10
8643 (d): class 11
2769 (e): class 15
357 (f): class 18
Time: 0.0 secs
# Draw the fitted tree.
plot(tree_result, subtree = NULL)

## PREDICTION
# Classify the held-out rows, then report the share of rows whose predicted
# district differs from the recorded one.
predictions <- predict(object = tree_result, newdata = test_c50, type = "class")
error_classification <- mean(predictions != test_c50[["district"]])
paste(
  "The classification error in test set is:",
  100 * error_classification,
  "%",
  sum(predictions == test_c50[["district"]]),
  "correct classified cases from",
  length(predictions),
  sep = " "
)
[1] "The classification error in test set is: 48.1948146353283 % 2138 correct classified cases from 4127"
# Re-score the training rows with the same tree to get the resubstitution
# error (type = "class" is the default of predict() for C5.0 models).
pred_train <- predict(object = tree_result, newdata = train_c50, type = "class")
error_classification <- mean(pred_train != train_c50[["district"]])
paste(
  "The classification error in train set is:",
  100 * error_classification,
  "%",
  sum(pred_train == train_c50[["district"]]),
  "correct classified cases from",
  length(pred_train),
  sep = " "
)
[1] "The classification error in train set is: 47.6499091459721 % 8643 correct classified cases from 16510"
## Robbery: predict the police district with a C5.0 decision tree
## (C5.0 is the successor of C4.5).
set.seed(1234)

# Keep only the nine districts modelled for this crime type.
robbery_districts <- c("1", "4", "5", "6", "8", "9", "11", "15", "18")
robbery_tr <- subset(robbery_tr, district %in% robbery_districts)
robbery <- subset(robbery, district %in% robbery_districts)

# Train and test come from two separate data sets; drop identifiers and the
# columns that are not used as predictors from both.
train_c50 <- subset(
  robbery_tr,
  select = -c(case_number, block, ward, description, day, month, latitude, longitude)
)
test_c50 <- subset(
  robbery,
  select = -c(case_number, block, ward, description, day, month, latitude, longitude)
)

# Integer-code location_description with ONE level set shared by both data
# sets. Coding train and test independently (as.numeric(as.factor(x)) on each)
# gives the same location a different number whenever the two data sets do not
# contain exactly the same set of locations, so the thresholds learned by the
# tree would be applied to the wrong locations at prediction time.
# NOTE(review): the codes are alphabetical ranks, so "location <= k" splits
# have no natural meaning; keeping the column as a factor may be preferable.
location_levels <- sort(union(
  as.character(train_c50$location_description),
  as.character(test_c50$location_description)
))
train_c50$location_description <- as.numeric(
  factor(train_c50$location_description, levels = location_levels)
)
test_c50$location_description <- as.numeric(
  factor(test_c50$location_description, levels = location_levels)
)

train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- factor(test_c50$arrest, levels = levels(train_c50$arrest))

# factor() drops districts removed by the filter above; the test column reuses
# the training levels so predictions and truth can be compared with == / !=.
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district, levels = levels(train_c50$district))

# Fit the tree with global pruning disabled. A higher CF means less pruning.
tree_result <- C5.0(
  district ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = TRUE, CF = 0.000001)
)
summary(tree_result)
Call:
C5.0.formula(formula = district ~ ., data = train_c50, control = C5.0Control(noGlobalPruning = TRUE, CF = 1e-06))
C5.0 [Release 2.07 GPL Edition] Wed May 6 17:02:50 2020
-------------------------------
Class specified by attribute `outcome'
Read 41594 cases (4 attributes) from undefined.data
Decision tree:
arrest = True:
:...location_description > 73:
: :...location_description > 77:
: : :...location_description <= 85: 1 (1478/1201)
: : : location_description > 85: 11 (738/598)
: : location_description <= 77:
: : :...location_description > 75: 8 (359/272)
: : location_description <= 75:
: : :...location_description <= 74: 5 (550/441)
: : location_description > 74: 11 (170/134)
: location_description <= 73:
: :...location_description > 54:
: :...location_description > 66:
: : :...location_description <= 69: 1 (139/100)
: : : location_description > 69: 6 (451/340)
: : location_description <= 66:
: : :...location_description > 56: 1 (648/338)
: : location_description <= 56:
: : :...location_description <= 55: 1 (2/1)
: : location_description > 55: 18 (101/38)
: location_description <= 54:
: :...location_description <= 37:
: :...location_description > 12: 1 (885/655)
: : location_description <= 12:
: : :...location_description > 6: 6 (469/380)
: : location_description <= 6:
: : :...location_description <= 2: 5 (58/39)
: : location_description > 2: 8 (6)
: location_description > 37:
: :...location_description > 51: 11 (1040/784)
: location_description <= 51:
: :...location_description > 39: 1 (343/242)
: location_description <= 39:
: :...location_description <= 38: 1 (70/41)
: location_description > 38: 6 (297/185)
arrest = False:
:...location_description > 82: 11 (11217/9162)
location_description <= 82:
:...location_description > 70: 8 (11291/8218)
location_description <= 70:
:...location_description <= 11:
:...location_description <= 9: 11 (1201/932)
: location_description > 9: 6 (4744/3827)
location_description > 11:
:...location_description <= 25:
:...location_description > 22: 5 (188/108)
: location_description <= 22:
: :...location_description > 17: 8 (85/69)
: location_description <= 17:
: :...location_description <= 16: 1 (139/102)
: location_description > 16: 18 (60/32)
location_description > 25:
:...location_description > 63: 8 (1904/1546)
location_description <= 63:
:...location_description > 54:
:...location_description <= 58: 18 (159/89)
: location_description > 58: 11 (67/53)
location_description <= 54:
:...location_description > 46: 11 (660/489)
location_description <= 46:
:...location_description > 36: 1 (904/637)
location_description <= 36:
:...location_description > 32: 11 (584/482)
location_description <= 32:
:...location_description > 28: 18 (473/350)
location_description <= 28:
:...location_description <= 26: 11 (91/75)
location_description > 26: 8 (23/13)
Evaluation on training data (41594 cases):
Decision Tree
----------------
Size Errors
35 31973(76.9%) <<
(a) (b) (c) (d) (e) (f) (g) (h) (i) <-classified as
---- ---- ---- ---- ---- ---- ---- ---- ----
1291 28 249 649 1145 173 (a): class 1
341 102 1023 1905 1586 64 (b): class 4
315 208 314 1598 1344 36 (c): class 5
450 90 1229 1796 2295 61 (d): class 6
390 90 666 3550 1944 64 (e): class 8
416 80 722 1601 1497 42 (f): class 9
435 99 799 1081 3059 44 (g): class 11
184 45 574 699 1652 25 (h): class 15
786 54 385 789 1246 284 (i): class 18
Attribute usage:
100.00% location_description
100.00% arrest
Time: 0.0 secs
# Draw the fitted tree.
plot(tree_result, subtree = NULL)

## PREDICTION
# Classify the rows of the test data set, then report the share of rows whose
# predicted district differs from the recorded one.
predictions <- predict(object = tree_result, newdata = test_c50, type = "class")
error_classification <- mean(predictions != test_c50[["district"]])
paste(
  "The classification error in test set is:",
  100 * error_classification,
  "%",
  sum(predictions == test_c50[["district"]]),
  "correct classified cases from",
  length(predictions),
  sep = " "
)
[1] "The classification error in test set is: 82.5898772193269 % 1971 correct classified cases from 11321"
# Re-score the training rows with the same tree to get the resubstitution
# error (type = "class" is the default of predict() for C5.0 models).
pred_train <- predict(object = tree_result, newdata = train_c50, type = "class")
error_classification <- mean(pred_train != train_c50[["district"]])
paste(
  "The classification error in train set is:",
  100 * error_classification,
  "%",
  sum(pred_train == train_c50[["district"]]),
  "correct classified cases from",
  length(pred_train),
  sep = " "
)
[1] "The classification error in train set is: 76.8692599894216 % 9621 correct classified cases from 41594"
## Creating the training and test datasets (theft)
set.seed(1234)
# Districts kept for the theft model.
theft_districts <- c("19", "1", "18", "24", "5", "4")
# location_description is excluded from the theft model. Assigning NULL
# (rather than subset(select = -...)) is a no-op when the column is already
# gone, so this chunk can be re-run without erroring.
theft$location_description <- NULL
theft_tr$location_description <- NULL
# subset() discards rows whose condition is NA, so %in% selects the same rows
# as the chained `district == ... | ...` comparisons it replaces.
theft_tr <- subset(theft_tr, district %in% theft_districts)
theft <- subset(theft, district %in% theft_districts)
train_c50 <- subset(theft_tr, select = -c(case_number, block, ward, description,
                                           day, month, latitude, longitude))
test_c50 <- subset(theft, select = -c(case_number, block, ward, description,
                                       day, month, latitude, longitude))
# Alternative kept for reference: random 80/20 split of a single frame.
#crime_split_c50<- initial_split(subset(chicago_crime_subset_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
#train_c50<- training(crime_split_c50)
#test_c50<- testing(crime_split_c50)
# NOTE: the as.factor()/as.numeric() re-encoding of location_description that
# used to follow was removed. The column is dropped above, so assigning
# as.factor(NULL) to it failed with "replacement has 0 rows".
# factor() rebuilds the levels so that only the districts kept above remain
# as classes of the response.
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
# Fit a single C5.0 decision tree predicting district from every other column
# (a higher CF means less pruning).
tree_result <- C5.0(
  district ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.01)
)
#summary(tree_result)
# Draw the fitted tree
plot(tree_result, subtree = NULL)
## PREDICTION
# Classify the cases of the test dataset
predictions <- predict(tree_result, newdata = test_c50, type = "class")
#table(prediction=predictions, real= crime_test_c50$primary_type)
# Share of test cases whose predicted district differs from the recorded one
error_classification <- mean(predictions != test_c50$district)
paste(
  "The classification error in test set is:", 100 * error_classification, "%",
  sum(predictions == test_c50$district),
  "correct classified cases from", length(predictions)
)
# Same evaluation on the cases the tree was trained on
pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
error_classification <- mean(pred_train != train_c50$district)
paste(
  "The classification error in train set is:", 100 * error_classification, "%",
  sum(pred_train == train_c50$district),
  "correct classified cases from", length(pred_train)
)
set.seed(1234)
# Districts kept for the violent-crime model.
violent_districts <- c("6", "11", "15", "8", "1", "18", "9", "5", "4", "19",
                       "20", "10", "21", "25", "2")
# subset() discards rows whose condition is NA, so %in% selects the same rows
# as the chained `district == ... | ...` comparisons it replaces.
violent_tr_crime <- subset(violent_tr_crime, district %in% violent_districts)
violent_crime <- subset(violent_crime, district %in% violent_districts)
train_c50 <- subset(violent_tr_crime, select = -c(case_number, block, ward, description,
                                                   day, month, latitude, longitude))
test_c50 <- subset(violent_crime, select = -c(case_number, block, ward, description,
                                               day, month, latitude, longitude))
# Alternative kept for reference: random 80/20 split of a single frame.
#crime_split_c50<- initial_split(subset(chicago_crime_subset_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
#train_c50<- training(crime_split_c50)
#test_c50<- testing(crime_split_c50)
# Categorical predictors: the test factors are built with the TRAINING levels.
# Building them independently gave the test set levels the tree had never
# seen, and predict() stopped with "factor location_description has new
# levels". Test values absent from the training data become NA.
train_c50$location_description <- as.factor(train_c50$location_description)
test_c50$location_description <- factor(test_c50$location_description,
                                        levels = levels(train_c50$location_description))
train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- factor(test_c50$arrest, levels = levels(train_c50$arrest))
# factor() rebuilds the levels so that only the districts kept above remain
# as classes of the response.
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
# Fit a boosted C5.0 decision tree (10 boosting trials requested).
# A higher CF means less pruning.
tree_result <- C5.0(district ~ ., data = train_c50, trials = 10,
                    control = C5.0Control(noGlobalPruning = FALSE, CF = 0.000001))
#summary(tree_result)
# Plot the last boosting trial that was actually built. The fitted model can
# contain fewer trials than requested, and the plot index is zero-based, so a
# hard-coded trial = 9 may point at a trial that does not exist.
last_trial <- tree_result$trials[["Actual"]] - 1
plot(tree_result, subtree = NULL, trial = last_trial)
Only 1 trials are in the model. Setting 'trial' to 0 (the plot code is zero-based).

## PREDICTION
# Predict the class of every case in the test dataset with the fitted tree;
# type = "class" requests the predicted class label for each row.
predictions <- predict(tree_result, newdata = test_c50, type ="class")
Error in model.frame.default(object$Terms, newdata, na.action = na.action, :
factor location_description has new levels , ATM (AUTOMATIC TELLER MACHINE), CHA PLAY LOT, HORSE STABLE, TRAILER, VEHICLE - DELIVERY TRUCK, VEHICLE-COMMERCIAL - TROLLEY BUS
set.seed(1234)
# Districts kept for the weapons-violation model.
weapons_districts <- c("6", "11", "7", "8", "1", "18", "5", "4")
# subset() discards rows whose condition is NA, so %in% selects the same rows
# as the chained `district == ... | ...` comparisons it replaces.
weapons_violation_tr <- subset(weapons_violation_tr, district %in% weapons_districts)
weapons_violation <- subset(weapons_violation, district %in% weapons_districts)
train_c50 <- subset(weapons_violation_tr, select = -c(case_number, block, ward, description,
                                                       day, month, latitude, longitude))
test_c50 <- subset(weapons_violation, select = -c(case_number, block, ward, description,
                                                   day, month, latitude, longitude))
# Alternative kept for reference: random 80/20 split of a single frame.
#crime_split_c50<- initial_split(subset(chicago_crime_subset_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
#train_c50<- training(crime_split_c50)
#test_c50<- testing(crime_split_c50)
# Integer-encode location_description with ONE shared set of levels (those of
# the training data). Encoding train and test separately gave each its own
# level set, so the same integer could stand for different locations in the
# two frames. Training codes are unchanged; test values absent from the
# training data become NA.
location_levels <- levels(as.factor(train_c50$location_description))
train_c50$location_description <- as.numeric(
  factor(train_c50$location_description, levels = location_levels)
)
test_c50$location_description <- as.numeric(
  factor(test_c50$location_description, levels = location_levels)
)
# factor() rebuilds the levels so that only the districts kept above remain
# as classes of the response.
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
# Grow one C5.0 decision tree with district as the response and all remaining
# columns as predictors (a higher CF means less pruning).
tree_result <- C5.0(
  district ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.01)
)
#summary(tree_result)
# Draw the fitted tree
plot(tree_result, subtree = NULL)
## PREDICTION
# Predicted class for each case of the test dataset
predictions <- predict(tree_result, newdata = test_c50, type = "class")
#table(prediction=predictions, real= crime_test_c50$primary_type)
# Fraction of test cases that were assigned the wrong district
error_classification <- mean(predictions != test_c50$district)
paste(
  "The classification error in test set is:", 100 * error_classification, "%",
  sum(predictions == test_c50$district),
  "correct classified cases from", length(predictions)
)
# Repeat the evaluation on the training cases
pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
error_classification <- mean(pred_train != train_c50$district)
paste(
  "The classification error in train set is:", 100 * error_classification, "%",
  sum(pred_train == train_c50$district),
  "correct classified cases from", length(pred_train)
)
# Tuning mtry, the number of predictors tried at each split.
#
# Fits one random forest per candidate mtry (1 .. number of predictors) and
# records the final out-of-bag (OOB) error rate of each fit.
#
# Arguments:
#   df    data frame holding the response and all predictors.
#   y     name of the response column, as a single string.
#   ntree number of trees grown for every candidate mtry.
# Returns a tibble with columns n_predictors (the mtry value) and
# oob_err_rate (OOB error after the last tree).
tuning_rf_mtry <- function(df, y, ntree = 100) {
  stopifnot(
    is.data.frame(df),
    is.character(y), length(y) == 1L,
    y %in% names(df)
  )
  # err.rate is only present on classification forests. With a non-factor
  # response it is NULL and the assignment below fails with "replacement has
  # length zero", so the response is turned into a factor here.
  if (!is.factor(df[[y]])) {
    df[[y]] <- factor(df[[y]])
  }
  max_predictors <- ncol(df) - 1
  n_predictors <- seq_len(max_predictors)
  oob_err_rate <- rep(NA_real_, max_predictors)
  # The formula does not depend on mtry, so it is built once.
  rf_formula <- stats::formula(paste(y, "~ ."))
  for (i in n_predictors) {
    # Same seed for every fit so the candidates differ only in mtry.
    set.seed(123)
    model_rf <- randomForest(formula = rf_formula, data = df, mtry = i, ntree = ntree)
    # First column of err.rate, last row: the value after all trees are grown.
    oob_err_rate[i] <- tail(model_rf$err.rate[, 1], n = 1)
  }
  dplyr::tibble(n_predictors, oob_err_rate)
}
hiperparameter_mtry <- tuning_rf_mtry(df = RF_train, y = "primary_type")
Error in oob_err_rate[i] <- tail(model_rf$err.rate[, 1], n = 1) :
replacement has length zero
REDES NEURONALES
## NETWORKS: self-organising maps (SOM)
chicago_crime_nn <- subset(chicago_crime_subset,
                           select = -c(case_number, block, ward, day, latitude, longitude))
chicago_crime_nn_tr <- subset(chicago_crime_subset_tr,
                              select = -c(case_number, block, ward, day, latitude, longitude))
library(kohonen) # for building the SOM map
library(caret) # for confusion matrix
# Extracting target variable
target_train <- chicago_crime_nn_tr$primary_type
chicago_crime_nn_tr <- subset(chicago_crime_nn_tr, select = -c(primary_type))
target_test <- chicago_crime_nn$primary_type
chicago_crime_nn <- subset(chicago_crime_nn, select = -c(primary_type))
# Integer-encode `x` using the factor levels of `reference`, so that a given
# code stands for the same category in both frames. Values of `x` that do not
# occur in `reference` become NA.
encode_like <- function(x, reference) {
  as.numeric(factor(x, levels = levels(as.factor(reference))))
}
# The test frame is encoded first, because its codes are derived from the
# still un-encoded training columns. Encoding each frame with its own levels
# (as before) made the same integer mean different categories in train/test.
chicago_crime_nn$arrest <- encode_like(chicago_crime_nn$arrest,
                                       chicago_crime_nn_tr$arrest)
chicago_crime_nn$description <- encode_like(chicago_crime_nn$description,
                                            chicago_crime_nn_tr$description)
chicago_crime_nn$location_description <- encode_like(chicago_crime_nn$location_description,
                                                     chicago_crime_nn_tr$location_description)
chicago_crime_nn$district <- as.numeric(chicago_crime_nn$district)
chicago_crime_nn$month <- as.numeric(chicago_crime_nn$month)
# Training frame: same codes as as.factor() followed by as.numeric().
chicago_crime_nn_tr$arrest <- as.numeric(as.factor(chicago_crime_nn_tr$arrest))
chicago_crime_nn_tr$district <- as.numeric(chicago_crime_nn_tr$district)
chicago_crime_nn_tr$description <- as.numeric(as.factor(chicago_crime_nn_tr$description))
chicago_crime_nn_tr$location_description <- as.numeric(
  as.factor(chicago_crime_nn_tr$location_description)
)
chicago_crime_nn_tr$month <- as.numeric(chicago_crime_nn_tr$month)
# Train/validation split. No scaling is applied: the scale() calls below are
# disabled, so the SOM works on the raw integer codes.
set.seed(7)
#target_train.sc<-scale(chicago_crime_nn_tr)
#target_test.sc<-scale(chicago_crime_nn)
# 75% of the rows of the training frame are sampled for training; the
# remaining rows form the test matrix.
index <- sample(nrow(chicago_crime_nn_tr), round(0.75 * nrow(chicago_crime_nn_tr)))
train <- as.matrix(chicago_crime_nn_tr[index, ])
test <- as.matrix(chicago_crime_nn_tr[-index, ])
train_label <- target_train[index]
test_label <- target_train[-index]
# Main characteristics of the map
som_grid <- somgrid(xdim = 8, ydim = 8, topo = "hexagonal")
# Training the map on at most the first 100000 training rows. min() keeps the
# slice valid when fewer rows are available (1:100000 would then fail with
# "subscript out of bounds"); drop = FALSE keeps a matrix in every case.
crime.som <- som(train[seq_len(min(100000, nrow(train))), , drop = FALSE],
                 grid = som_grid,
                 rlen = 100, alpha = c(0.05, 0.01),
                 radius = 2, keep.data = TRUE)
# Names of the variables used
colnames(train)
[1] "description" "location_description" "arrest" "district" "month"
# main characteristics of the map
summary(crime.som)
SOM of size 8x8 with a hexagonal topology and a bubble neighbourhood function.
The number of data layers is 1.
Distance measure(s) used: sumofsquares.
Training data included: 100000 objects.
Mean distance to the closest unit in the map: 78.082.
#Showing the training process
plot(crime.som, type="changes")

#node counts
plot(crime.som, type="counts", main="Examples per Neuron")

# Codes/Weight vectors
plot(crime.som, type = "codes", main = "Patterns Discovered")
# Palette of n colours running from blue to red: the rainbow colours up to
# end = 4/6, in reverse order.
coolBlueHotRed <- function(n, alpha = 1) {
  rev(rainbow(n, end = 4 / 6, alpha = alpha))
}
# Earlier spelling of the same palette, kept so existing references still work.
coolBlueHOtRed <- coolBlueHotRed

# One property map per codebook column (first five columns), 2 x 3 layout.
som_codes <- getCodes(crime.som, 1)
old_par <- par(mfrow = c(2, 3))
for (j in seq_len(min(5, ncol(som_codes)))) {
  plot(crime.som, type = "property", property = som_codes[, j],
       main = colnames(som_codes)[j],
       palette.name = coolBlueHotRed)
}
# Same maps for every training column on a 5 x 3 layout with smaller text.
par(mfrow = c(5, 3))

for (j in seq_len(ncol(train))) {
  plot(crime.som, type = "property", property = crime.som$codes[[1]][, j],
       palette.name = coolBlueHotRed,
       main = colnames(train)[j], cex = 0.5)
}
# Restore the layout that was active before these multi-panel plots.
par(old_par)

#som.prediction <- predict(crime.som, test)
# Group the patterns (codebook vectors) of the map into clusters
groups <- 8
# Hierarchical clustering of the codebook vectors, cut into `groups` clusters
crime.hc <- cutree(hclust(dist(getCodes(crime.som, 1))), k = groups)
plot(
  crime.som,
  type = "codes",
  bgcol = rainbow(groups)[crime.hc],
  main = "clustering the patterns discovered"
)
add.cluster.boundaries(crime.som, crime.hc)

# Supervised SOM (xyf) on the same train/test split as above. No scaling is
# applied: the scale() calls are disabled.
set.seed(7)
#target_train.sc<-scale(chicago_crime_nn_tr)
#target_test.sc<-scale(chicago_crime_nn)
# Disabled alternatives: use the scaled matrices, or the full train/test
# frames, instead of the 75/25 split created earlier.
#train <- target_train.sc
#test <- target_test.sc
#train <- chicago_crime_nn_tr
#test <- chicago_crime_nn
#train_label<-target_train
#test_label<-target_test
# Main characteristics of the map
som_grid <- somgrid(xdim = 8, ydim = 8, topo = "hexagonal")
set.seed(7)
# Rows used to train the map: at most the first 100000 training cases.
# min() keeps the slice valid when fewer rows are available.
som_rows <- seq_len(min(100000, nrow(train)))
kohmap <- xyf(train[som_rows, , drop = FALSE], train_label[som_rows],
              grid = som_grid,
              rlen = 100, alpha = c(0.05, 0.01),
              radius = 2, keep.data = TRUE)
# Showing the training process
plot(kohmap, type = "changes")

# Showing the distribution of the crime-type labels in neurons
plot(kohmap, type = "codes", codeRendering = "lines", shape = "straight",
     main = c("Crime"))


plot(kohmap, type = "codes", codeRendering = "lines", shape = "straight",
     main = c(" patterns"))


# Plotting classes in neurons. The labels are restricted to the rows the map
# was trained on, so there is exactly one label per mapped case; as.factor()
# makes the integer coding work for character labels as well as factors.
mapped_class <- as.numeric(as.factor(train_label[som_rows])) + 3
plot(kohmap, type = "mapping", labels = mapped_class,
     col = mapped_class, pch = 4, main = "Map of classes", palette.name = terrain.colors)

#Prediction using the training data
print("TRAINING PREDICTION")
[1] "TRAINING PREDICTION"
kohmap.predict_tr<- predict(kohmap, newdata=train, whatmap = 1)
prediction_table_tr<-table(train_label,kohmap.predict_tr$predictions[[2]])
confusionMatrix(prediction_table_tr)
Confusion Matrix and Statistics
train_label ASSAULT CRIMINAL DAMAGE DECEPTIVE PRACTICE NARCOTICS ROBBERY THEFT VIOLENT CRIME WEAPONS VIOLATION
ASSAULT 4120 0 15 0 1442 3264 23831 0
CRIMINAL DAMAGE 327 52639 1472 2 3088 322 259 120
DECEPTIVE PRACTICE 246 458 10222 0 9272 14838 4850 24
NARCOTICS 94 0 206 16574 621 9233 1396 0
ROBBERY 555 17038 1937 0 30716 8157 8735 688
THEFT 130 1345 1644 0 2375 142814 12255 0
VIOLENT CRIME 4098 56 894 70 1847 7334 45993 20
WEAPONS VIOLATION 78 3119 1 84 1459 93 774 4591
Overall Statistics
Accuracy : 0.672
95% CI : (0.6706, 0.6734)
No Information Rate : 0.4064
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5821
Mcnemar's Test P-Value : < 2.2e-16
Statistics by Class:
Class: ASSAULT Class: CRIMINAL DAMAGE Class: DECEPTIVE PRACTICE Class: NARCOTICS Class: ROBBERY Class: THEFT
Sensitivity 0.427032 0.7051 0.62363 0.99068 0.60441 0.7676
Specificity 0.936294 0.9854 0.93275 0.97382 0.90882 0.9347
Pos Pred Value 0.126102 0.9040 0.25613 0.58932 0.45286 0.8895
Neg Pred Value 0.986998 0.9449 0.98524 0.99964 0.94845 0.8545
Prevalence 0.021073 0.1631 0.03580 0.03654 0.11100 0.4064
Detection Rate 0.008999 0.1150 0.02233 0.03620 0.06709 0.3119
Detection Prevalence 0.071362 0.1272 0.08717 0.06143 0.14815 0.3507
Balanced Accuracy 0.681663 0.8453 0.77819 0.98225 0.75662 0.8511
Class: VIOLENT CRIME Class: WEAPONS VIOLATION
Sensitivity 0.4689 0.84347
Specificity 0.9602 0.98760
Pos Pred Value 0.7626 0.45014
Neg Pred Value 0.8689 0.99810
Prevalence 0.2143 0.01189
Detection Rate 0.1005 0.01003
Detection Prevalence 0.1317 0.02228
Balanced Accuracy 0.7145 0.91554
#Prediction using the test data
print("TEST PREDICTION")
[1] "TEST PREDICTION"
kohmap.predict<- predict(kohmap, newdata=test, whatmap = 1)
prediction_table<-table(test_label,kohmap.predict$predictions[[2]])
confusionMatrix(prediction_table)
Confusion Matrix and Statistics
test_label ASSAULT CRIMINAL DAMAGE DECEPTIVE PRACTICE NARCOTICS ROBBERY THEFT VIOLENT CRIME WEAPONS VIOLATION
ASSAULT 1293 0 11 0 482 1117 7937 0
CRIMINAL DAMAGE 100 17536 532 0 1044 86 93 38
DECEPTIVE PRACTICE 79 146 3319 0 3092 5130 1598 6
NARCOTICS 33 0 74 5598 207 3013 466 0
ROBBERY 219 5663 632 0 10195 2713 2872 242
THEFT 51 430 567 0 831 47387 4179 0
VIOLENT CRIME 1374 21 310 17 624 2447 15460 9
WEAPONS VIOLATION 35 1015 0 24 507 28 224 1506
Overall Statistics
Accuracy : 0.6703
95% CI : (0.6679, 0.6726)
No Information Rate : 0.4057
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.58
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: ASSAULT Class: CRIMINAL DAMAGE Class: DECEPTIVE PRACTICE Class: NARCOTICS Class: ROBBERY Class: THEFT
Sensitivity 0.406093 0.7068 0.60955 0.99273 0.6003 0.7653
Specificity 0.936110 0.9852 0.93170 0.97419 0.9090 0.9332
Pos Pred Value 0.119280 0.9026 0.24824 0.59610 0.4524 0.8866
Neg Pred Value 0.986662 0.9454 0.98473 0.99971 0.9478 0.8534
Prevalence 0.020863 0.1626 0.03568 0.03695 0.1113 0.4057
Detection Rate 0.008472 0.1149 0.02175 0.03668 0.0668 0.3105
Detection Prevalence 0.071030 0.1273 0.08761 0.06154 0.1477 0.3502
Balanced Accuracy 0.671101 0.8460 0.77063 0.98346 0.7547 0.8492
Class: VIOLENT CRIME Class: WEAPONS VIOLATION
Sensitivity 0.4709 0.836202
Specificity 0.9599 0.987846
Pos Pred Value 0.7630 0.451033
Neg Pred Value 0.8688 0.998024
Prevalence 0.2151 0.011801
Detection Rate 0.1013 0.009868
Detection Prevalence 0.1328 0.021879
Balanced Accuracy 0.7154 0.912024
#Prediction using the test data
print("TEST PREDICTION")
[1] "TEST PREDICTION"
kohmap.predict<- predict(kohmap, newdata=test, whatmap = 1)
prediction_table<-table(test_label,kohmap.predict$predictions[[2]])
confusionMatrix(prediction_table)
Confusion Matrix and Statistics
test_label ASSAULT CRIMINAL DAMAGE DECEPTIVE PRACTICE NARCOTICS ROBBERY THEFT VIOLENT CRIME WEAPONS VIOLATION
ASSAULT 1293 0 11 0 482 1117 7937 0
CRIMINAL DAMAGE 100 17536 532 0 1044 86 93 38
DECEPTIVE PRACTICE 79 146 3319 0 3092 5130 1598 6
NARCOTICS 33 0 74 5598 207 3013 466 0
ROBBERY 219 5663 632 0 10195 2713 2872 242
THEFT 51 430 567 0 831 47387 4179 0
VIOLENT CRIME 1374 21 310 17 624 2447 15460 9
WEAPONS VIOLATION 35 1015 0 24 507 28 224 1506
Overall Statistics
Accuracy : 0.6703
95% CI : (0.6679, 0.6726)
No Information Rate : 0.4057
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.58
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: ASSAULT Class: CRIMINAL DAMAGE Class: DECEPTIVE PRACTICE Class: NARCOTICS Class: ROBBERY Class: THEFT
Sensitivity 0.406093 0.7068 0.60955 0.99273 0.6003 0.7653
Specificity 0.936110 0.9852 0.93170 0.97419 0.9090 0.9332
Pos Pred Value 0.119280 0.9026 0.24824 0.59610 0.4524 0.8866
Neg Pred Value 0.986662 0.9454 0.98473 0.99971 0.9478 0.8534
Prevalence 0.020863 0.1626 0.03568 0.03695 0.1113 0.4057
Detection Rate 0.008472 0.1149 0.02175 0.03668 0.0668 0.3105
Detection Prevalence 0.071030 0.1273 0.08761 0.06154 0.1477 0.3502
Balanced Accuracy 0.671101 0.8460 0.77063 0.98346 0.7547 0.8492
Class: VIOLENT CRIME Class: WEAPONS VIOLATION
Sensitivity 0.4709 0.836202
Specificity 0.9599 0.987846
Pos Pred Value 0.7630 0.451033
Neg Pred Value 0.8688 0.998024
Prevalence 0.2151 0.011801
Detection Rate 0.1013 0.009868
Detection Prevalence 0.1328 0.021879
Balanced Accuracy 0.7154 0.912024
## NETWORKS: second SOM experiment (description removed, ward kept)
chicago_crime_nn2 <- subset(chicago_crime_subset,
                            select = -c(description, case_number, block, day, latitude, longitude))
chicago_crime_nn_tr2 <- subset(chicago_crime_subset_tr,
                               select = -c(description, case_number, block, day, latitude, longitude))
library(kohonen) # for building the SOM map
library(caret) # for confusion matrix
# Extracting target variable
target_train2 <- chicago_crime_nn_tr2$primary_type
chicago_crime_nn_tr2 <- subset(chicago_crime_nn_tr2, select = -c(primary_type))
target_test2 <- chicago_crime_nn2$primary_type
chicago_crime_nn2 <- subset(chicago_crime_nn2, select = -c(primary_type))
# Integer-encode `x` using the factor levels of `reference`, so that a given
# code stands for the same category in both frames. Values of `x` that do not
# occur in `reference` become NA.
encode_like <- function(x, reference) {
  as.numeric(factor(x, levels = levels(as.factor(reference))))
}
# The test frame is encoded first, because its codes are derived from the
# still un-encoded training columns. Encoding each frame with its own levels
# (as before) made the same integer mean different categories in train/test.
chicago_crime_nn2$arrest <- encode_like(chicago_crime_nn2$arrest,
                                        chicago_crime_nn_tr2$arrest)
chicago_crime_nn2$location_description <- encode_like(chicago_crime_nn2$location_description,
                                                      chicago_crime_nn_tr2$location_description)
chicago_crime_nn2$district <- as.numeric(chicago_crime_nn2$district)
chicago_crime_nn2$month <- as.numeric(chicago_crime_nn2$month)
# Training frame: same codes as as.factor() followed by as.numeric().
chicago_crime_nn_tr2$arrest <- as.numeric(as.factor(chicago_crime_nn_tr2$arrest))
chicago_crime_nn_tr2$district <- as.numeric(chicago_crime_nn_tr2$district)
chicago_crime_nn_tr2$location_description <- as.numeric(
  as.factor(chicago_crime_nn_tr2$location_description)
)
chicago_crime_nn_tr2$month <- as.numeric(chicago_crime_nn_tr2$month)
# Removed: two assignments that converted `month` to a factor in
# chicago_crime_nn_tr and chicago_crime_nn. Those are the frames of the
# previous experiment, not the *2 frames prepared here, so the lines only
# modified unrelated data.
# Train/validation split. No scaling is applied: the scale() calls below are
# disabled, so the SOM works on the raw integer codes.
set.seed(7)
#target_train.sc<-scale(chicago_crime_nn_tr)
#target_test.sc<-scale(chicago_crime_nn)
# 75% of the rows of the training frame are sampled for training; the
# remaining rows form the test matrix.
index <- sample(nrow(chicago_crime_nn_tr2), round(0.75 * nrow(chicago_crime_nn_tr2)))
train <- as.matrix(chicago_crime_nn_tr2[index, ])
test <- as.matrix(chicago_crime_nn_tr2[-index, ])
train_label <- target_train2[index]
test_label <- target_train2[-index]
# Main characteristics of the map
som_grid <- somgrid(xdim = 8, ydim = 8, topo = "hexagonal")
# Training the map on at most the first 100000 training rows. min() keeps the
# slice valid when fewer rows are available (1:100000 would then fail with
# "subscript out of bounds"); drop = FALSE keeps a matrix in every case.
crime.som <- som(train[seq_len(min(100000, nrow(train))), , drop = FALSE],
                 grid = som_grid,
                 rlen = 100, alpha = c(0.05, 0.01),
                 radius = 2, keep.data = TRUE)
# Names of the variables used
colnames(train)
[1] "location_description" "arrest" "district" "ward" "month"
# main characteristics of the map
summary(crime.som)
SOM of size 8x8 with a hexagonal topology and a bubble neighbourhood function.
The number of data layers is 1.
Distance measure(s) used: sumofsquares.
Training data included: 100000 objects.
Mean distance to the closest unit in the map: 31.207.
#Showing the training process
plot(crime.som, type="changes")

#node counts
plot(crime.som, type="counts", main="Examples per Neuron")

# Codes/Weight vectors
plot(crime.som, type = "codes", main = "Patterns Discovered")
# Palette of n colours running from blue to red: the rainbow colours up to
# end = 4/6, in reverse order.
coolBlueHotRed <- function(n, alpha = 1) {
  rev(rainbow(n, end = 4 / 6, alpha = alpha))
}
# Earlier spelling of the same palette, kept so existing references still work.
coolBlueHOtRed <- coolBlueHotRed

# One property map per codebook column (first three columns), 2 x 3 layout.
som_codes <- getCodes(crime.som, 1)
old_par <- par(mfrow = c(2, 3))
for (j in seq_len(min(3, ncol(som_codes)))) {
  plot(crime.som, type = "property", property = som_codes[, j],
       main = colnames(som_codes)[j],
       palette.name = coolBlueHotRed)
}
# Same maps for every training column on a 5 x 3 layout with smaller text.
par(mfrow = c(5, 3))

for (j in seq_len(ncol(train))) {
  plot(crime.som, type = "property", property = crime.som$codes[[1]][, j],
       palette.name = coolBlueHotRed,
       main = colnames(train)[j], cex = 0.5)
}
# Restore the layout that was active before these multi-panel plots.
par(old_par)

# Group the patterns (codebook vectors) of the map into clusters
library(RColorBrewer)
groups <- 8
# Hierarchical clustering of the codebook vectors, cut into `groups` clusters
crime.hc <- cutree(hclust(dist(getCodes(crime.som, 1))), k = groups)
plot(
  crime.som,
  type = "codes",
  bgcol = brewer.pal(groups, name = "YlGnBu")[crime.hc],
  main = "clustering the patterns discovered"
)
add.cluster.boundaries(crime.som, crime.hc)

# Supervised SOM (xyf) on the same train/test split as above. No scaling is
# applied: the scale() calls are disabled.
set.seed(7)
#target_train.sc<-scale(chicago_crime_nn_tr)
#target_test.sc<-scale(chicago_crime_nn)
# Disabled alternatives: use the scaled matrices, or the full train/test
# frames, instead of the 75/25 split created earlier.
#train <- target_train.sc
#test <- target_test.sc
#train <- chicago_crime_nn_tr
#test <- chicago_crime_nn
#train_label<-target_train
#test_label<-target_test
# Main characteristics of the map
som_grid <- somgrid(xdim = 8, ydim = 8, topo = "hexagonal")
set.seed(7)
# Rows used to train the map: at most the first 100000 training cases.
# min() keeps the slice valid when fewer rows are available.
som_rows <- seq_len(min(100000, nrow(train)))
kohmap <- xyf(train[som_rows, , drop = FALSE], train_label[som_rows],
              grid = som_grid,
              rlen = 100, alpha = c(0.05, 0.01),
              radius = 2, keep.data = TRUE)
# Showing the training process
plot(kohmap, type = "changes")

# Showing the distribution of the crime-type labels in neurons
plot(kohmap, type = "codes", codeRendering = "lines", shape = "straight",
     main = c("Crime"))


plot(kohmap, type = "codes", codeRendering = "lines", shape = "straight",
     main = c(" patterns"))


# Plotting classes in neurons
#plot(kohmap, type="mapping", labels=as.numeric(train_label)+3,
# col=as.numeric(train_label)+3, pch=4, main="Map of classes", palette.name = terrain.colors)
#Prediction using the training data
print("TRAINING PREDICTION")
[1] "TRAINING PREDICTION"
kohmap.predict_tr<- predict(kohmap, newdata=train, whatmap = 1)
prediction_table_tr<-table(train_label,kohmap.predict_tr$predictions[[2]])
confusionMatrix(prediction_table_tr)
Confusion Matrix and Statistics
train_label ASSAULT CRIMINAL DAMAGE DECEPTIVE PRACTICE NARCOTICS ROBBERY THEFT VIOLENT CRIME WEAPONS VIOLATION
ASSAULT 0 0 327 1453 5590 22496 2806 0
CRIMINAL DAMAGE 0 0 853 663 11935 44081 697 0
DECEPTIVE PRACTICE 0 0 2343 640 8667 27933 327 0
NARCOTICS 0 0 87 5353 3029 18090 1565 0
ROBBERY 0 0 1297 2509 16747 44776 2497 0
THEFT 0 0 1322 1954 12329 142118 2840 0
VIOLENT CRIME 0 0 590 3873 10745 39590 5514 0
WEAPONS VIOLATION 0 0 31 649 1778 7098 643 0
Overall Statistics
Accuracy : 0.3758
95% CI : (0.3744, 0.3772)
No Information Rate : 0.7561
P-Value [Acc > NIR] : 1
Kappa : 0.1127
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: ASSAULT Class: CRIMINAL DAMAGE Class: DECEPTIVE PRACTICE Class: NARCOTICS Class: ROBBERY Class: THEFT
Sensitivity NA NA 0.342044 0.31315 0.23647 0.4105
Specificity 0.92864 0.8728 0.916700 0.94833 0.86802 0.8348
Pos Pred Value NA NA 0.058707 0.19034 0.24691 0.8851
Neg Pred Value NA NA 0.989216 0.97268 0.86135 0.3135
Prevalence 0.00000 0.0000 0.014962 0.03734 0.15468 0.7561
Detection Rate 0.00000 0.0000 0.005118 0.01169 0.03658 0.3104
Detection Prevalence 0.07136 0.1272 0.087171 0.06143 0.14815 0.3507
Balanced Accuracy NA NA 0.629372 0.63074 0.55225 0.6227
Class: VIOLENT CRIME Class: WEAPONS VIOLATION
Sensitivity 0.32648 NA
Specificity 0.87573 0.97772
Pos Pred Value 0.09142 NA
Neg Pred Value 0.97139 NA
Prevalence 0.03689 0.00000
Detection Rate 0.01204 0.00000
Detection Prevalence 0.13173 0.02228
Balanced Accuracy 0.60111 NA
#Prediction using the test data
print("TEST PREDICTION")
[1] "TEST PREDICTION"
kohmap.predict<- predict(kohmap, newdata=test, whatmap = 1)
prediction_table<-table(test_label,kohmap.predict$predictions[[2]])
confusionMatrix(prediction_table)
Confusion Matrix and Statistics
test_label ASSAULT CRIMINAL DAMAGE DECEPTIVE PRACTICE NARCOTICS ROBBERY THEFT VIOLENT CRIME WEAPONS VIOLATION
ASSAULT 0 0 91 514 1938 7375 922 0
CRIMINAL DAMAGE 0 0 300 221 3912 14800 196 0
DECEPTIVE PRACTICE 0 0 763 219 3003 9284 101 0
NARCOTICS 0 0 44 1846 972 6030 499 0
ROBBERY 0 0 413 849 5544 14908 822 0
THEFT 0 0 425 680 4163 47221 956 0
VIOLENT CRIME 0 0 185 1277 3599 13357 1844 0
WEAPONS VIOLATION 0 0 15 244 583 2289 208 0
Overall Statistics
Accuracy : 0.3749
95% CI : (0.3725, 0.3774)
No Information Rate : 0.7553
P-Value [Acc > NIR] : 1
Kappa : 0.1122
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: ASSAULT Class: CRIMINAL DAMAGE Class: DECEPTIVE PRACTICE Class: NARCOTICS Class: ROBBERY Class: THEFT
Sensitivity NA NA 0.34123 0.31556 0.23379 0.4097
Specificity 0.92897 0.8727 0.91616 0.94859 0.86817 0.8334
Pos Pred Value NA NA 0.05707 0.19657 0.24601 0.8835
Neg Pred Value NA NA 0.98942 0.97204 0.86031 0.3139
Prevalence 0.00000 0.0000 0.01465 0.03833 0.15539 0.7553
Detection Rate 0.00000 0.0000 0.00500 0.01210 0.03633 0.3094
Detection Prevalence 0.07103 0.1273 0.08761 0.06154 0.14767 0.3502
Balanced Accuracy NA NA 0.62870 0.63207 0.55098 0.6215
Class: VIOLENT CRIME Class: WEAPONS VIOLATION
Sensitivity 0.33237 NA
Specificity 0.87476 0.97812
Pos Pred Value 0.09101 NA
Neg Pred Value 0.97201 NA
Prevalence 0.03635 0.00000
Detection Rate 0.01208 0.00000
Detection Prevalence 0.13277 0.02188
Balanced Accuracy 0.60357 NA
## NETWORKS: SOM on the narcotics training data, with district as the target
narcotics_nn <- subset(
  narcotics_tr,
  select = -c(case_number, block, day, latitude, longitude)
)
library(kohonen) # for building the SOM map
library(caret) # for confusion matrix
# The target (district) is set aside and removed from the predictors
target_train <- narcotics_nn$district
narcotics_nn <- subset(narcotics_nn, select = -c(district))
# Categorical predictors are turned into the integer codes of their factor levels
narcotics_nn$arrest <- as.numeric(as.factor(narcotics_nn$arrest))
narcotics_nn$primary_type <- as.numeric(as.factor(narcotics_nn$primary_type))
#narcotics_nn$district <- as.numeric(narcotics_nn$district)
narcotics_nn$description <- as.numeric(as.factor(narcotics_nn$description))
narcotics_nn$location_description <- as.numeric(
  as.factor(narcotics_nn$location_description)
)
narcotics_nn$month <- as.numeric(narcotics_nn$month)
#Scaling original data
set.seed(7)
#target_train.sc<-scale(chicago_crime_nn_tr)
#target_test.sc<-scale(chicago_crime_nn)
# creation of training and test datasets
index <- sample(nrow(narcotics_nn), round(0.75*nrow(narcotics_nn)))
train <- narcotics_nn[index,]
test <- narcotics_nn[-index,]
#train <- chicago_crime_nn_tr
train <- as.matrix(train)
#test <- chicago_crime_nn
test <- as.matrix(test)
#train_label<-target_train
#test_label<-target_test
train_label<-target_train[index]
test_label<-target_train[-index]
#main characteristics of the map
som_grid<-somgrid(xdim=8, ydim=8, topo="hexagonal")
#training the map
crime.som <- som(train, grid=som_grid,
rlen=100, alpha=c(0.05, 0.01),
radius= 2, keep.data=T)
# Names of the variables used
colnames(train)
[1] "primary_type" "description" "location_description" "arrest" "ward"
[6] "month"
# main characteristics of the map
summary(crime.som)
SOM of size 8x8 with a hexagonal topology and a bubble neighbourhood function.
The number of data layers is 1.
Distance measure(s) used: sumofsquares.
Training data included: 28136 objects.
Mean distance to the closest unit in the map: 27.532.
#Showing the training process
plot(crime.som, type="changes")

#node counts
plot(crime.som, type="counts", main="Examples per Neuron")

#Codes/Weight vectors
plot(crime.som, type="codes", main="Patterns Discovered")

#Scaling original data
set.seed(7)
#target_train.sc<-scale(chicago_crime_nn_tr)
#target_test.sc<-scale(chicago_crime_nn)
# creation of training and test datasets
#index <- sample(nrow(target_train.sc), round(0.75*nrow(crime.sc)))
#train <- target_train.sc
#test <- target_test.sc
#train <- chicago_crime_nn_tr
#test <- chicago_crime_nn
#index <- sample(nrow(chicago_crime_nn_tr), round(0.75*nrow(chicago_crime_nn_tr)))
#train <- chicago_crime_nn_tr[index,]
#test <- chicago_crime_nn_tr[-index,]
#train_label<-target_train
#test_label<-target_test
#main characteristics of the map
som_grid<-somgrid(xdim=8, ydim=8, topo="hexagonal")
set.seed(7)
kohmap <- xyf(train, train_label,
grid=som_grid,
rlen=100, alpha=c(0.05, 0.01),
radius= 2, keep.data=T)
#Showing the training process
plot(kohmap, type="changes")

#Showing the codes (codebook vectors) of the trained supervised map
plot(kohmap, type="codes", codeRendering = "lines", shape="straight",
main=c("Crime"))


plot(kohmap, type="codes", codeRendering = "lines", shape="straight",
main=c(" patterns"))


# Plotting classes in neurons
#plot(kohmap, type="mapping", labels=as.numeric(train_label)+3,
# col=as.numeric(train_label)+3, pch=4, main="Map of classes", palette.name = terrain.colors)
#Prediction using the training data
print("TRAINING PREDICTION")
[1] "TRAINING PREDICTION"
kohmap.predict_tr<- predict(kohmap, newdata=train, whatmap = 1)
prediction_table_tr<-table(train_label,kohmap.predict_tr$predictions[[2]])
confusionMatrix(prediction_table_tr)
Confusion Matrix and Statistics
train_label 1 2 3 4 5 6 7 8 9 10 11 12 14 15 16 17 18 19 20 22 24 25 31
1 34 28 0 127 0 17 25 7 0 0 13 0 0 19 28 0 32 16 0 0 14 0 0
2 11 163 6 200 0 23 102 31 0 30 4 0 0 0 0 0 1 0 0 0 0 0 0
3 6 98 28 215 0 26 174 32 0 205 42 0 0 0 0 0 1 0 0 0 0 0 0
4 11 103 0 458 0 24 438 19 125 0 3 0 0 0 0 0 2 0 0 0 0 0 0
5 22 3 2 190 23 27 283 14 90 26 135 0 0 119 15 0 2 0 0 0 3 127 0
6 12 69 22 242 0 95 552 43 31 326 178 0 0 0 0 0 2 0 0 0 0 2 0
7 18 61 0 104 0 45 1282 51 183 139 231 0 0 0 0 0 0 0 0 0 0 1 0
8 3 0 0 5 0 9 527 55 216 179 147 0 0 0 0 0 0 0 0 0 0 4 0
9 12 28 8 33 0 23 477 24 306 115 81 0 0 0 0 0 0 0 0 0 0 0 0
10 3 0 11 3 0 10 69 55 35 1749 1059 0 0 3 0 0 1 0 0 0 0 5 0
11 1 66 4 98 2 1 85 54 0 1094 6437 0 0 120 47 0 2 2 0 0 9 52 0
12 24 55 0 102 0 16 41 10 0 55 321 0 0 7 2 0 2 0 0 0 1 7 0
14 1 19 0 28 5 4 22 5 0 17 79 0 0 15 2 0 0 2 0 0 0 30 0
15 0 0 1 0 6 0 16 4 0 186 1547 0 0 563 65 0 3 10 0 0 17 176 0
16 0 0 0 0 0 0 0 1 0 0 83 0 0 45 248 0 29 22 0 0 55 82 0
17 0 0 0 0 0 0 1 0 0 0 31 0 0 56 31 0 5 16 0 0 15 100 0
18 0 1 0 2 1 0 1 0 0 5 95 0 0 39 62 0 69 26 0 0 38 8 0
19 0 1 0 1 0 0 2 0 0 0 9 0 0 15 53 0 29 77 0 0 111 5 0
20 0 0 0 0 0 0 0 0 0 0 6 0 0 14 29 0 28 34 0 0 67 19 0
22 0 0 17 4 10 12 17 4 2 176 124 0 0 44 3 0 0 0 0 0 0 56 0
24 0 0 0 0 0 0 0 0 0 0 1 0 0 3 47 0 21 40 0 0 173 4 0
25 0 0 0 0 11 0 4 1 0 22 553 0 0 243 34 0 1 18 0 0 19 299 0
31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Overall Statistics
Accuracy : 0.4286
95% CI : (0.4228, 0.4344)
No Information Rate : 0.3973
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.3245
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8 Class: 9 Class: 10 Class: 11 Class: 12
Sensitivity 0.215190 0.234532 0.2828283 0.25276 0.3965517 0.286145 0.31132 0.134146 0.30972 0.40449 0.5758 NA
Specificity 0.988348 0.985132 0.9715019 0.97246 0.9623193 0.946806 0.96532 0.960687 0.97050 0.94734 0.9035 0.97715
Pos Pred Value 0.094444 0.285464 0.0338573 0.38715 0.0212766 0.060356 0.60615 0.048035 0.27642 0.58242 0.7973 NA
Neg Pred Value 0.995536 0.980700 0.9974001 0.94976 0.9987063 0.991077 0.89101 0.986847 0.97477 0.89755 0.7636 NA
Prevalence 0.005616 0.024701 0.0035186 0.06440 0.0020614 0.011800 0.14636 0.014572 0.03512 0.15368 0.3973 0.00000
Detection Rate 0.001208 0.005793 0.0009952 0.01628 0.0008175 0.003376 0.04556 0.001955 0.01088 0.06216 0.2288 0.00000
Detection Prevalence 0.012795 0.020294 0.0293929 0.04205 0.0384205 0.055943 0.07517 0.040695 0.03934 0.10673 0.2870 0.02285
Balanced Accuracy 0.601769 0.609832 0.6271651 0.61261 0.6794355 0.616475 0.63832 0.547417 0.64011 0.67591 0.7396 NA
Class: 14 Class: 15 Class: 16 Class: 17 Class: 18 Class: 19 Class: 20 Class: 22 Class: 24 Class: 25 Class: 31
Sensitivity NA 0.43142 0.372372 NA 0.300000 0.292776 NA NA 0.331418 0.30604 NA
Specificity 0.991861 0.92430 0.988460 0.990937 0.990038 0.991892 0.992998 0.98333 0.995799 0.96664 1
Pos Pred Value NA 0.21704 0.438938 NA 0.198847 0.254125 NA NA 0.598616 0.24813 NA
Neg Pred Value NA 0.97095 0.984839 NA 0.994206 0.993317 NA NA 0.987467 0.97482 NA
Prevalence 0.000000 0.04638 0.023671 0.000000 0.008175 0.009347 0.000000 0.00000 0.018553 0.03472 0
Detection Rate 0.000000 0.02001 0.008814 0.000000 0.002452 0.002737 0.000000 0.00000 0.006149 0.01063 0
Detection Prevalence 0.008139 0.09220 0.020081 0.009063 0.012333 0.010769 0.007002 0.01667 0.010272 0.04283 0
Balanced Accuracy NA 0.67786 0.680416 NA 0.645019 0.642334 NA NA 0.663608 0.63634 NA
#Prediction using the test data
print("TEST PREDICTION")
[1] "TEST PREDICTION"
kohmap.predict<- predict(kohmap, newdata=test, whatmap = 1)
prediction_table<-table(test_label,kohmap.predict$predictions[[2]])
confusionMatrix(prediction_table)
Confusion Matrix and Statistics
test_label 1 2 3 4 5 6 7 8 9 10 11 12 14 15 16 17 18 19 20 22 24 25 31
1 11 6 0 49 0 1 3 1 0 0 1 0 0 2 8 0 10 4 0 0 3 0 0
2 4 53 0 76 0 11 28 9 0 9 2 0 0 0 0 0 0 0 0 0 0 0 0
3 2 42 7 66 0 5 65 13 0 75 17 0 0 0 0 0 0 0 0 0 0 0 0
4 4 42 0 145 0 6 140 6 37 0 1 0 0 0 0 0 0 0 0 0 0 1 0
5 1 1 0 65 6 6 120 3 38 10 44 0 0 36 2 0 0 0 0 0 2 43 0
6 3 23 10 70 0 26 178 16 20 109 65 0 0 0 0 0 0 0 0 0 0 0 0
7 1 10 1 37 0 13 406 17 65 33 95 0 0 0 0 0 0 0 0 0 0 0 0
8 1 0 0 1 0 1 145 11 74 57 51 0 0 0 0 0 0 0 0 0 0 2 0
9 5 8 4 14 0 13 169 8 131 31 22 0 0 0 0 0 0 0 0 0 0 0 0
10 1 0 6 0 0 6 20 13 13 576 359 0 0 1 0 0 1 0 0 0 0 0 0
11 0 21 1 28 1 0 27 25 0 368 2148 0 0 41 19 0 0 0 0 0 1 27 0
12 5 25 0 22 0 7 19 3 0 24 94 0 0 0 1 0 0 0 0 0 0 1 0
14 0 9 0 4 0 2 3 3 0 2 42 0 0 7 1 0 0 0 0 0 0 17 0
15 0 0 1 0 5 0 6 2 0 61 505 0 0 202 17 0 1 6 0 0 5 60 0
16 0 0 0 0 2 0 0 1 0 0 32 0 0 12 70 0 10 12 0 0 19 30 0
17 0 0 0 0 0 0 0 0 0 0 13 0 0 13 9 0 4 5 0 0 4 36 0
18 0 0 0 0 0 0 0 0 0 1 20 0 0 19 16 0 17 11 0 0 13 1 0
19 0 0 0 0 0 0 2 0 0 0 4 0 0 5 19 0 14 32 0 0 53 2 0
20 0 0 0 0 0 0 1 0 0 0 3 0 0 3 6 0 7 16 0 0 25 8 0
22 0 0 8 1 2 2 5 0 2 67 36 0 0 12 2 0 0 0 0 0 0 21 0
24 0 0 0 0 0 0 0 0 0 0 1 0 0 0 13 0 9 20 0 0 61 2 0
25 0 0 0 0 3 0 2 0 0 4 176 0 0 92 17 0 0 7 0 0 9 82 0
31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Overall Statistics
Accuracy : 0.4248
95% CI : (0.4147, 0.4349)
No Information Rate : 0.3978
P-Value [Acc > NIR] : 5.519e-08
Kappa : 0.3199
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8 Class: 9 Class: 10 Class: 11 Class: 12
Sensitivity 0.289474 0.220833 0.1842105 0.25087 0.3157895 0.262626 0.30321 0.083969 0.34474 0.40364 0.5757 NA
Specificity 0.990579 0.984790 0.9694893 0.97307 0.9603632 0.946767 0.96617 0.964100 0.96955 0.94718 0.9010 0.97857
Pos Pred Value 0.111111 0.276042 0.0239726 0.37958 0.0159151 0.050000 0.59882 0.032070 0.32346 0.57831 0.7935 NA
Neg Pred Value 0.997091 0.979645 0.9965885 0.95187 0.9985559 0.991760 0.89277 0.986720 0.97225 0.89849 0.7627 NA
Prevalence 0.004052 0.025589 0.0040516 0.06163 0.0020258 0.010555 0.14277 0.013967 0.04052 0.15215 0.3978 0.00000
Detection Rate 0.001173 0.005651 0.0007463 0.01546 0.0006397 0.002772 0.04329 0.001173 0.01397 0.06141 0.2290 0.00000
Detection Prevalence 0.010555 0.020471 0.0311334 0.04073 0.0401962 0.055443 0.07229 0.036571 0.04318 0.10619 0.2886 0.02143
Balanced Accuracy 0.640026 0.602812 0.5768499 0.61197 0.6380764 0.604697 0.63469 0.524035 0.65714 0.67541 0.7384 NA
Class: 14 Class: 15 Class: 16 Class: 17 Class: 18 Class: 19 Class: 20 Class: 22 Class: 24 Class: 25 Class: 31
Sensitivity NA 0.45393 0.350000 NA 0.232877 0.283186 NA NA 0.312821 0.246246 NA
Specificity 0.990404 0.92512 0.987145 0.991044 0.991296 0.989316 0.992643 0.98315 0.995100 0.965731 1
Pos Pred Value NA 0.23192 0.372340 NA 0.173469 0.244275 NA NA 0.575472 0.209184 NA
Neg Pred Value NA 0.97144 0.985856 NA 0.993966 0.991241 NA NA 0.985549 0.972071 NA
Prevalence 0.000000 0.04745 0.021324 0.000000 0.007783 0.012048 0.000000 0.00000 0.020791 0.035505 0
Detection Rate 0.000000 0.02154 0.007463 0.000000 0.001813 0.003412 0.000000 0.00000 0.006504 0.008743 0
Detection Prevalence 0.009596 0.09287 0.020045 0.008956 0.010449 0.013967 0.007357 0.01685 0.011302 0.041796 0
Balanced Accuracy NA 0.68953 0.668572 NA 0.612086 0.636251 NA NA 0.653960 0.605988 NA
## REDES
crimen <- subset(weapons_violation_tr, select=-c(case_number,block,day,latitude,longitude))
library(kohonen) # for building the SOM map
library(caret) #for confusion matrix
#Extracting target variable
target_train <- crimen$district
crimen <- subset(crimen, select=-c(district))
crimen$arrest <- as.factor(crimen$arrest)
crimen$arrest <- as.numeric(crimen$arrest)
crimen$primary_type <- as.factor(crimen$primary_type)
crimen$primary_type <- as.numeric(crimen$primary_type)
#narcotics_nn$district <- as.numeric(narcotics_nn$district)
#chicago_crime_nn$district <- as.factor(chicago_crime_nn$district)
crimen$description <- as.factor(crimen$description)
crimen$description <- as.numeric(crimen$description)
crimen$location_description <- as.factor(crimen$location_description)
crimen$location_description <- as.numeric(crimen$location_description)
crimen$month <- as.numeric(crimen$month)
#Scaling original data
set.seed(7)
#target_train.sc<-scale(chicago_crime_nn_tr)
#target_test.sc<-scale(chicago_crime_nn)
# creation of training and test datasets
index <- sample(nrow(crimen), round(0.75*nrow(crimen)))
train <- crimen[index,]
test <- crimen[-index,]
#train <- chicago_crime_nn_tr
train <- as.matrix(train)
#test <- chicago_crime_nn
test <- as.matrix(test)
#train_label<-target_train
#test_label<-target_test
train_label<-target_train[index]
test_label<-target_train[-index]
#main characteristics of the map
som_grid<-somgrid(xdim=12, ydim=12, topo="hexagonal")
#training the map
crime.som <- som(train, grid=som_grid,
rlen=100, alpha=c(0.05, 0.01),
radius= 2, keep.data=T)
# Names of the variables used
colnames(train)
[1] "primary_type" "description" "location_description" "arrest" "ward"
[6] "month"
# main characteristics of the map
summary(crime.som)
SOM of size 12x12 with a hexagonal topology and a bubble neighbourhood function.
The number of data layers is 1.
Distance measure(s) used: sumofsquares.
Training data included: 10154 objects.
Mean distance to the closest unit in the map: 7.334.
#Showing the training process
plot(crime.som, type="changes")

#node counts
plot(crime.som, type="counts", main="Examples per Neuron")

#Codes/Weight vectors
plot(crime.som, type="codes", main="Patterns Discovered")

#Scaling original data
set.seed(7)
#main characteristics of the map
som_grid<-somgrid(xdim=12, ydim=12, topo="hexagonal")
set.seed(7)
kohmap <- xyf(train, train_label,
grid=som_grid,
rlen=100, alpha=c(0.05, 0.01),
radius= 2, keep.data=T)
#Showing the training process
plot(kohmap, type="changes")

#Showing the codes (codebook vectors) of the trained supervised map
plot(kohmap, type="codes", codeRendering = "lines", shape="straight",
main=c("Crime"))


plot(kohmap, type="codes", codeRendering = "lines", shape="straight",
main=c(" patterns"))


# Plotting classes in neurons
#plot(kohmap, type="mapping", labels=as.numeric(train_label)+3,
# col=as.numeric(train_label)+3, pch=4, main="Map of classes", palette.name = terrain.colors)
#Prediction using the training data
print("TRAINING PREDICTION")
[1] "TRAINING PREDICTION"
kohmap.predict_tr<- predict(kohmap, newdata=train, whatmap = 1)
prediction_table_tr<-table(train_label,kohmap.predict_tr$predictions[[2]])
confusionMatrix(prediction_table_tr)
Confusion Matrix and Statistics
train_label 1 2 3 4 5 6 7 8 9 10 11 12 14 15 16 17 18 19 20 22 24 25 31
1 7 28 8 7 3 4 0 0 0 1 1 4 1 0 2 1 1 5 0 0 0 0 0
2 4 141 66 26 1 38 10 0 0 3 0 0 1 0 0 0 0 0 0 0 0 0 0
3 1 48 178 103 7 152 27 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 29 34 496 104 55 9 0 22 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5 0 3 2 262 462 15 12 0 11 0 0 0 0 7 0 3 1 0 0 0 0 43 0
6 0 23 81 166 30 285 303 0 2 44 2 0 0 0 0 0 0 0 0 0 0 0 0
7 0 28 51 55 1 100 883 25 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0
8 0 0 24 2 1 13 302 43 42 52 1 0 0 0 0 0 0 0 0 0 0 0 0
9 7 38 39 51 18 42 233 15 118 8 3 0 0 0 0 0 0 0 0 0 0 0 0
10 0 0 0 9 5 41 15 0 58 675 69 0 0 9 0 0 0 0 0 0 0 3 0
11 0 38 5 4 13 5 2 0 0 336 560 2 4 134 0 0 5 0 0 0 0 53 0
12 7 37 10 6 0 1 0 0 0 74 68 5 3 3 0 1 2 0 0 0 0 3 0
14 2 14 3 1 28 2 0 0 0 26 43 2 9 10 0 1 1 0 0 0 0 20 0
15 0 0 0 0 66 1 0 0 0 39 257 0 0 183 0 2 2 1 0 0 0 88 0
16 0 0 0 0 24 0 0 0 0 0 3 0 0 4 13 1 5 22 0 0 7 21 0
17 0 0 0 0 49 0 0 0 0 0 13 0 0 9 2 15 1 3 0 0 4 29 0
18 0 4 0 0 2 0 0 0 0 4 29 0 0 3 4 8 10 19 0 0 0 9 0
19 1 1 0 0 3 0 0 0 0 0 5 0 0 0 1 0 3 45 0 0 32 2 0
20 0 0 0 0 8 0 0 0 0 0 0 0 0 2 5 3 2 14 0 0 32 9 0
22 0 0 26 10 104 90 20 0 1 30 4 0 0 7 0 2 0 0 0 0 0 14 0
24 0 0 0 0 3 0 0 0 0 0 1 0 0 2 3 2 1 11 0 0 106 10 0
25 0 2 0 0 120 1 0 0 0 10 156 0 0 83 0 4 0 1 0 0 0 190 0
31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Overall Statistics
Accuracy : 0.4357
95% CI : (0.426, 0.4454)
No Information Rate : 0.1788
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.3855
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8 Class: 9 Class: 10 Class: 11 Class: 12
Sensitivity 0.2413793 0.32488 0.33776 0.41402 0.43916 0.33728 0.48623 0.518072 0.46457 0.51487 0.46091 0.3846154
Specificity 0.9934815 0.98467 0.96416 0.97175 0.96056 0.93007 0.96858 0.956608 0.95414 0.97637 0.93277 0.9787989
Pos Pred Value 0.0958904 0.48621 0.34034 0.66222 0.56273 0.30449 0.77118 0.089583 0.20629 0.76357 0.48234 0.0227273
Neg Pred Value 0.9978177 0.97030 0.96376 0.92536 0.93678 0.93925 0.89644 0.995865 0.98581 0.93139 0.92717 0.9991947
Prevalence 0.0028560 0.04274 0.05190 0.11798 0.10360 0.08322 0.17885 0.008174 0.02501 0.12911 0.11966 0.0012803
Detection Rate 0.0006894 0.01389 0.01753 0.04885 0.04550 0.02807 0.08696 0.004235 0.01162 0.06648 0.05515 0.0004924
Detection Prevalence 0.0071893 0.02856 0.05151 0.07376 0.08085 0.09218 0.11276 0.047272 0.05633 0.08706 0.11434 0.0216663
Balanced Accuracy 0.6174304 0.65478 0.65096 0.69289 0.69986 0.63367 0.72741 0.737340 0.70935 0.74562 0.69684 0.6817072
Class: 14 Class: 15 Class: 16 Class: 17 Class: 18 Class: 19 Class: 20 Class: 22 Class: 24 Class: 25 Class: 31
Sensitivity 0.5000000 0.40132 0.433333 0.348837 0.2941176 0.371901 NA NA 0.58564 0.38462 NA
Specificity 0.9849053 0.95298 0.991407 0.989121 0.9918972 0.995216 0.992614 0.96967 0.99669 0.96097 1
Pos Pred Value 0.0555556 0.28638 0.130000 0.120000 0.1086957 0.483871 NA NA 0.76259 0.33510 NA
Neg Pred Value 0.9990993 0.97131 0.998309 0.997208 0.9976148 0.992446 NA NA 0.99251 0.96829 NA
Prevalence 0.0017727 0.04491 0.002955 0.004235 0.0033484 0.011916 0.000000 0.00000 0.01783 0.04865 0
Detection Rate 0.0008864 0.01802 0.001280 0.001477 0.0009848 0.004432 0.000000 0.00000 0.01044 0.01871 0
Detection Prevalence 0.0159543 0.06293 0.009848 0.012310 0.0090605 0.009159 0.007386 0.03033 0.01369 0.05584 0
Balanced Accuracy 0.7424526 0.67715 0.712370 0.668979 0.6430074 0.683558 NA NA 0.79116 0.67279 NA
#Prediction using the test data
print("TEST PREDICTION")
[1] "TEST PREDICTION"
kohmap.predict<- predict(kohmap, newdata=test, whatmap = 1)
prediction_table<-table(test_label,kohmap.predict$predictions[[2]])
confusionMatrix(prediction_table)
Confusion Matrix and Statistics
test_label 1 2 3 4 5 6 7 8 9 10 11 12 14 15 16 17 18 19 20 22 24 25 31
1 2 14 1 4 0 0 0 0 0 0 1 0 1 0 0 3 0 5 0 0 0 2 0
2 1 38 19 9 1 14 4 0 0 1 0 1 3 0 0 0 0 0 0 0 0 0 0
3 0 19 46 25 4 47 13 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 12 7 174 35 23 8 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5 0 1 2 89 131 11 3 0 6 0 2 0 0 2 0 1 0 0 0 0 0 16 0
6 0 8 31 61 9 103 102 0 1 18 0 0 0 0 0 0 0 0 0 0 0 0 0
7 0 13 20 27 1 43 324 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
8 0 0 12 0 2 3 98 20 18 14 0 0 0 0 0 0 0 0 0 0 0 0 0
9 4 18 12 14 12 14 65 3 26 1 2 0 0 0 0 0 0 0 0 0 0 0 0
10 0 0 0 2 4 20 7 0 17 188 32 0 0 5 0 0 0 0 0 0 0 0 0
11 1 8 3 0 5 7 1 0 0 106 179 0 1 48 0 1 1 0 0 0 0 13 0
12 5 14 3 3 0 1 0 0 0 22 21 1 2 1 0 0 1 0 0 0 0 3 0
14 0 9 3 0 6 1 0 0 0 8 19 2 1 0 0 1 0 0 0 0 0 4 0
15 0 0 0 0 16 1 0 0 0 15 84 0 0 55 0 1 0 1 0 0 0 32 0
16 0 0 0 0 6 0 0 0 0 0 2 0 0 3 4 1 1 7 0 0 2 6 0
17 0 0 0 0 22 0 0 0 0 0 3 0 0 4 1 3 0 1 0 0 0 16 0
18 0 1 0 0 0 0 0 0 0 0 10 0 0 0 1 4 5 6 0 0 0 3 0
19 0 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 14 0 0 12 1 0
20 0 0 0 0 3 0 0 0 0 0 0 0 0 1 3 3 0 3 0 0 7 1 0
22 0 0 7 2 30 28 4 0 1 17 0 0 0 1 0 0 0 0 0 0 0 3 0
24 0 0 0 0 2 0 0 0 0 0 0 0 0 1 2 0 1 7 0 0 21 4 0
25 0 0 0 0 33 0 0 0 0 4 39 0 0 38 0 0 0 0 0 0 0 73 0
31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Overall Statistics
Accuracy : 0.4161
95% CI : (0.3994, 0.4329)
No Information Rate : 0.1859
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.3626
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8 Class: 9 Class: 10 Class: 11 Class: 12
Sensitivity 0.153846 0.24359 0.27711 0.42439 0.40432 0.32595 0.51510 0.58824 0.342105 0.47716 0.4543 0.2500000
Specificity 0.990804 0.98358 0.96644 0.96907 0.95654 0.92503 0.95826 0.95612 0.956167 0.97090 0.9348 0.9775148
Pos Pred Value 0.060606 0.41758 0.29870 0.65414 0.49621 0.30931 0.73804 0.11976 0.152047 0.68364 0.4786 0.0129870
Neg Pred Value 0.996717 0.96417 0.96285 0.92431 0.93814 0.93019 0.89643 0.99565 0.984438 0.93374 0.9286 0.9990928
Prevalence 0.003842 0.04610 0.04905 0.12116 0.09574 0.09338 0.18587 0.01005 0.022459 0.11643 0.1164 0.0011820
Detection Rate 0.000591 0.01123 0.01359 0.05142 0.03871 0.03044 0.09574 0.00591 0.007683 0.05556 0.0529 0.0002955
Detection Prevalence 0.009752 0.02689 0.04551 0.07861 0.07801 0.09840 0.12973 0.04935 0.050532 0.08126 0.1105 0.0227541
Balanced Accuracy 0.572325 0.61359 0.62177 0.69673 0.68043 0.62549 0.73668 0.77218 0.649136 0.72403 0.6945 0.6137574
Class: 14 Class: 15 Class: 16 Class: 17 Class: 18 Class: 19 Class: 20 Class: 22 Class: 24 Class: 25 Class: 31
Sensitivity 0.1250000 0.34591 0.363636 0.1666667 0.555556 0.318182 NA NA 0.500000 0.41243 NA
Specificity 0.9843009 0.95349 0.991699 0.9860368 0.992593 0.995210 0.993794 0.97252 0.994913 0.96445 1
Pos Pred Value 0.0185185 0.26829 0.125000 0.0600000 0.166667 0.466667 NA NA 0.552632 0.39037 NA
Neg Pred Value 0.9978979 0.96729 0.997912 0.9955009 0.998807 0.991055 NA NA 0.993724 0.96747 NA
Prevalence 0.0023641 0.04699 0.003251 0.0053191 0.002660 0.013002 0.000000 0.00000 0.012411 0.05230 0
Detection Rate 0.0002955 0.01625 0.001182 0.0008865 0.001478 0.004137 0.000000 0.00000 0.006206 0.02157 0
Detection Prevalence 0.0159574 0.06058 0.009456 0.0147754 0.008865 0.008865 0.006206 0.02748 0.011229 0.05526 0
Balanced Accuracy 0.5546505 0.64970 0.677668 0.5763518 0.774074 0.656696 NA NA 0.747457 0.68844 NA
library(RColorBrewer)
groups<-23
#Applying hierarchical clustering for grouping patterns
crime.hc=cutree(hclust(dist(crime.som$codes[[1]])), groups)
plot(crime.som, type="codes", bgcol = heat.colors(groups)[crime.hc],
main="clustering the patterns discovered")
add.cluster.boundaries(crime.som,crime.hc)

---
title: "R Notebook"
output: html_notebook
---

## READ THE DATA

```{r}
library(dplyr)
library(data.table)
library(mltools)
# Load the full crime extract. Text columns are kept as character
# (stringsAsFactors = FALSE) so they can be cleaned with gsub() below.
chicago_crime <- read.table(file = "chicago_crime_clean.csv", #Name of text file.
                      sep = ",",                       #Separation character.
                      header = TRUE,                   #If column names are in the first row.
                      na.strings = "NA",               #Character to be marked as missing value.
                      stringsAsFactors = FALSE)

# Clean the free-text columns.
# Every pattern below is meant literally, so fixed = TRUE is used throughout.
# Without it "(E.G.  UBER  LYFT)" is read as a regular expression: the
# parentheses become a group, only the inner text is removed and a stray "()"
# is left behind; likewise the "." in "MANU/POSS. W/" would match any character.

# Commas in the location become spaces (this removes every comma, so no second
# comma pass is needed on this column).
chicago_crime$location_description <- gsub(",", " ", chicago_crime$location_description, fixed = TRUE)

# Strip ":=", ":", the "MANU/POSS. W/" prefix and commas from the description.
chicago_crime$description <- gsub(":=", "", chicago_crime$description, fixed = TRUE)
chicago_crime$description <- gsub(":", "", chicago_crime$description, fixed = TRUE)
chicago_crime$description <- gsub("MANU/POSS. W/", "", chicago_crime$description, fixed = TRUE)
chicago_crime$description <- gsub(",", "", chicago_crime$description, fixed = TRUE)

# Remove the parenthetical example list. The double spaces are intentional:
# they are what the comma -> space replacement above leaves behind.
chicago_crime$location_description <- gsub("(E.G.  UBER  LYFT)", "", chicago_crime$location_description, fixed = TRUE)
# Periods are left in place in both columns.

# Derive calendar parts from `date`, drop incomplete rows, then discard the
# identifier/coordinate columns (and `date`/`year`) that are not used later.
chicago_crime <- chicago_crime %>%
  dplyr::mutate(
    year = lubridate::year(date),
    month = lubridate::month(date),
    day = lubridate::day(date)
  ) %>%
  na.omit() %>%
  select(-c(X, unique_key, x_coordinate, y_coordinate, location, domestic, fbi_code, date, year))

# The district is a category, not a quantity.
chicago_crime[["district"]] <- factor(chicago_crime[["district"]])

# Quick inspection of the prepared data.
unique(chicago_crime[["primary_type"]])

head(chicago_crime)
summary(chicago_crime)
```

## Read the training set

```{r}
library(dplyr)
library(data.table)
library(mltools)
# Load the training extract. Text columns are kept as character
# (stringsAsFactors = FALSE) so they can be cleaned with gsub() below.
chicago_crime_tr <- read.table(file = "chicago_crime_tr.csv", #Name of text file.
                      sep = ",",                       #Separation character.
                      header = TRUE,                   #If column names are in the first row.
                      na.strings = "NA",               #Character to be marked as missing value.
                      stringsAsFactors = FALSE)

# Clean the free-text columns.
# Every pattern below is meant literally, so fixed = TRUE is used throughout.
# Without it "(E.G.  UBER  LYFT)" is read as a regular expression: the
# parentheses become a group, only the inner text is removed and a stray "()"
# is left behind; likewise the "." in "MANU/POSS. W/" would match any character.

# Commas in the location become spaces (this removes every comma, so no second
# comma pass is needed on this column).
chicago_crime_tr$location_description <- gsub(",", " ", chicago_crime_tr$location_description, fixed = TRUE)

# Strip ":=", ":", the "MANU/POSS. W/" prefix and commas from the description.
chicago_crime_tr$description <- gsub(":=", "", chicago_crime_tr$description, fixed = TRUE)
chicago_crime_tr$description <- gsub(":", "", chicago_crime_tr$description, fixed = TRUE)
chicago_crime_tr$description <- gsub("MANU/POSS. W/", "", chicago_crime_tr$description, fixed = TRUE)
chicago_crime_tr$description <- gsub(",", "", chicago_crime_tr$description, fixed = TRUE)

# Remove the parenthetical example list. The double spaces are intentional:
# they are what the comma -> space replacement above leaves behind.
chicago_crime_tr$location_description <- gsub("(E.G.  UBER  LYFT)", "", chicago_crime_tr$location_description, fixed = TRUE)
# Periods are left in place in both columns.

# Derive calendar parts from `date`, discard the identifier/coordinate columns
# (and `date`/`year`) that are not used later, then drop incomplete rows.
chicago_crime_tr <- chicago_crime_tr %>%
  dplyr::mutate(
    year = lubridate::year(date),
    month = lubridate::month(date),
    day = lubridate::day(date)
  ) %>%
  select(-c(X, unique_key, x_coordinate, y_coordinate, location, domestic, fbi_code, date, year)) %>%
  na.omit()

# The district is a category, not a quantity.
chicago_crime_tr[["district"]] <- factor(chicago_crime_tr[["district"]])

# Quick inspection of the prepared training data.
unique(chicago_crime_tr[["primary_type"]])

head(chicago_crime_tr)
summary(chicago_crime_tr)

# Fold fine-grained offence labels into broader categories.
# Names are the labels found in the data, values are the category each one
# is merged into; labels not listed here are left untouched.
crime_groups_tr <- c(
  "CRIMINAL TRESPASS"        = "ROBBERY",
  "BURGLARY"                 = "ROBBERY",
  "MOTOR VEHICLE THEFT"      = "THEFT",
  "HOMICIDE"                 = "VIOLENT CRIME",
  "KIDNAPPING"               = "VIOLENT CRIME",
  "BATTERY"                  = "VIOLENT CRIME",
  "INTIMIDATION"             = "VIOLENT CRIME",
  "ARSON"                    = "VIOLENT CRIME",
  "PROSTITUTION"             = "SEX OFFENSE",
  "CRIM SEXUAL ASSAULT"      = "SEX OFFENSE",
  "OTHER NARCOTIC VIOLATION" = "NARCOTICS"
)
to_recode_tr <- chicago_crime_tr$primary_type %in% names(crime_groups_tr)
chicago_crime_tr$primary_type[to_recode_tr] <-
  unname(crime_groups_tr[as.character(chicago_crime_tr$primary_type[to_recode_tr])])
chicago_crime_tr$primary_type <- factor(chicago_crime_tr$primary_type)

# Keep only the eight categories analysed in the rest of the notebook.
chicago_crime_subset_tr <- subset(
  chicago_crime_tr,
  primary_type %in% c("ASSAULT", "VIOLENT CRIME", "THEFT", "NARCOTICS",
                      "WEAPONS VIOLATION", "ROBBERY", "CRIMINAL DAMAGE",
                      "DECEPTIVE PRACTICE")
)

# Re-create the factor so the discarded categories disappear from its levels.
chicago_crime_subset_tr$primary_type <- factor(chicago_crime_subset_tr$primary_type)
chicago_crime_subset_tr <- na.omit(chicago_crime_subset_tr)

# Structure and missing-value overview of the training subset.
library(DataExplorer)
plot_str(chicago_crime_subset_tr)
plot_missing(chicago_crime_subset_tr)
# Disabled plots: plot_histogram(chicago_crime_subset),
# plot_density(chicago_crime_subset),
# plot_correlation(chicago_numeric, type = 'continuous')

# Month is treated as a category from here on.
chicago_crime_subset_tr$month <- as.factor(chicago_crime_subset_tr$month)

plot_bar(chicago_crime_subset_tr)
```
## EXPLORATORY ANALYSIS

```{r}
library(tidyverse)

# Number of records per primary_type (x labels rotated to stay readable).
ggplot(chicago_crime, aes(x = primary_type)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

count(chicago_crime, primary_type)

# Number of records per district.
ggplot(chicago_crime, aes(x = district)) +
  geom_bar() +
  theme(axis.text.x = element_text(hjust = 1))

count(chicago_crime, district)

# Number of records per value of the arrest flag.
ggplot(chicago_crime, aes(x = arrest)) +
  geom_bar() +
  theme(axis.text.x = element_text(hjust = 1))

count(chicago_crime, arrest)

# Fold fine-grained offence labels into broader categories.
# Names are the labels found in the data, values are the category each one
# is merged into; labels not listed here are left untouched.
crime_groups <- c(
  "CRIMINAL TRESPASS"        = "ROBBERY",
  "BURGLARY"                 = "ROBBERY",
  "MOTOR VEHICLE THEFT"      = "THEFT",
  "HOMICIDE"                 = "VIOLENT CRIME",
  "KIDNAPPING"               = "VIOLENT CRIME",
  "BATTERY"                  = "VIOLENT CRIME",
  "INTIMIDATION"             = "VIOLENT CRIME",
  "ARSON"                    = "VIOLENT CRIME",
  "PROSTITUTION"             = "SEX OFFENSE",
  "CRIM SEXUAL ASSAULT"      = "SEX OFFENSE",
  "OTHER NARCOTIC VIOLATION" = "NARCOTICS"
)
to_recode <- chicago_crime$primary_type %in% names(crime_groups)
chicago_crime$primary_type[to_recode] <-
  unname(crime_groups[as.character(chicago_crime$primary_type[to_recode])])
chicago_crime$primary_type <- factor(chicago_crime$primary_type)

# Same bar chart and count table as before, now on the merged categories.
ggplot(chicago_crime, aes(x = primary_type)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

count(chicago_crime, primary_type)

# Keep only the eight categories analysed in the rest of the notebook and
# re-create the factor so the discarded categories disappear from its levels.
chicago_crime_subset <- subset(
  chicago_crime,
  primary_type %in% c("ASSAULT", "VIOLENT CRIME", "THEFT", "NARCOTICS",
                      "WEAPONS VIOLATION", "ROBBERY", "CRIMINAL DAMAGE",
                      "DECEPTIVE PRACTICE")
)
chicago_crime_subset$primary_type <- factor(chicago_crime_subset$primary_type)

# Records per retained category.
ggplot(chicago_crime_subset, aes(x = primary_type)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Pairwise count plots: point size reflects the number of records in each
# combination of the two variables.
ggplot(chicago_crime_subset, aes(x = primary_type, y = district)) +
  geom_count() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

ggplot(chicago_crime_subset, aes(x = arrest, y = primary_type)) +
  geom_count() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

ggplot(chicago_crime_subset, aes(x = arrest, y = district)) +
  geom_count() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))


```

## EXPLORATORY ANALYSIS BY CRIME

```{r}
# Return the rows of `data` whose primary_type equals `type`
# (all columns kept, original row names preserved).
rows_of_type <- function(data, type) {
  data[which(data$primary_type == type), , drop = FALSE]
}

# One data frame per crime category, taken from the full data set ...
assault            <- rows_of_type(chicago_crime_subset, "ASSAULT")
violent_crime      <- rows_of_type(chicago_crime_subset, "VIOLENT CRIME")
theft              <- rows_of_type(chicago_crime_subset, "THEFT")
narcotics          <- rows_of_type(chicago_crime_subset, "NARCOTICS")
weapons_violation  <- rows_of_type(chicago_crime_subset, "WEAPONS VIOLATION")
robbery            <- rows_of_type(chicago_crime_subset, "ROBBERY")
criminal_damage    <- rows_of_type(chicago_crime_subset, "CRIMINAL DAMAGE")
deceptive_practice <- rows_of_type(chicago_crime_subset, "DECEPTIVE PRACTICE")

# ... and the same split for the training set.
# NOTE(review): `violent_tr_crime` does not follow the `<name>_tr` pattern of
# its siblings; the name is kept because other chunks may refer to it.
assault_tr            <- rows_of_type(chicago_crime_subset_tr, "ASSAULT")
violent_tr_crime      <- rows_of_type(chicago_crime_subset_tr, "VIOLENT CRIME")
theft_tr              <- rows_of_type(chicago_crime_subset_tr, "THEFT")
narcotics_tr          <- rows_of_type(chicago_crime_subset_tr, "NARCOTICS")
weapons_violation_tr  <- rows_of_type(chicago_crime_subset_tr, "WEAPONS VIOLATION")
robbery_tr            <- rows_of_type(chicago_crime_subset_tr, "ROBBERY")
criminal_damage_tr    <- rows_of_type(chicago_crime_subset_tr, "CRIMINAL DAMAGE")
deceptive_practice_tr <- rows_of_type(chicago_crime_subset_tr, "DECEPTIVE PRACTICE")
```

## DISTRICTS

```{r}
library(sqldf)

# Per-district mean coordinates and number of records, computed separately for
# the rows selected by `arrest LIKE "True"` and by `arrest LIKE "False"`.
districts_true <- sqldf('SELECT district, AVG(latitude) as avg_latitude,AVG(longitude) as avg_longitude, count(*) as arrest FROM chicago_crime_subset WHERE arrest LIKE "True" GROUP BY district ORDER BY district')
districts_false <- sqldf('SELECT district, AVG(latitude) as avg_latitude,AVG(longitude) as avg_longitude, count(*) as no_arrest FROM chicago_crime_subset WHERE arrest LIKE "False" GROUP BY district ORDER BY district')
districts_true$arrest <- as.numeric(districts_true$arrest)
districts_false$no_arrest <- as.numeric(districts_false$no_arrest)
districts_true
districts_false

# Police station locations, one row per station.
police_districts <- read.table(file = "Police_Stations.csv", #Name of text file.
                      sep = ",",                       #Separation character.
                      header = TRUE,                   #If column names are in the first row.
                      na.strings = "NA",               #Character to be marked as missing value.
                      stringsAsFactors = FALSE)
police_districts

# "Headquarters" is relabelled as district "0" before converting to a factor.
police_districts$DISTRICT[police_districts$DISTRICT == "Headquarters"] <- "0"
police_districts$DISTRICT <- as.factor(police_districts$DISTRICT)

districts <- sqldf('SELECT DISTRICT as district, LATITUDE as latitude,LONGITUDE as longitude FROM police_districts')

# Combine the two aggregates BY DISTRICT rather than by row position: the two
# queries only return districts that have at least one matching record, so
# their rows are not guaranteed to line up. Districts with no arrest rows get
# an arrest count of 0.
arrest_rows <- match(districts_false$district, districts_true$district)
n_arrest <- districts_true$arrest[arrest_rows]
n_arrest[is.na(n_arrest)] <- 0
n_crimes <- n_arrest + districts_false$no_arrest

# Districts that appear only in the arrest query cannot be placed in the table
# built from `districts_false`; report them instead of dropping them silently.
arrest_only <- setdiff(as.character(districts_true$district),
                       as.character(districts_false$district))
if (length(arrest_only) > 0) {
  warning("Districts with arrests only are missing from arrest_percentage: ",
          paste(arrest_only, collapse = ", "), call. = FALSE)
}

arrest_percentage <- data.frame(District = districts_false$district,
                                PctArrest = n_arrest / n_crimes,
                                Crimes = n_crimes)
arrest_percentage

# Bars: number of crimes per district. Yellow line: arrest share, multiplied by
# 10000 so it fits the count axis; the secondary axis divides by 10000 again.
ggplot(data = arrest_percentage) +
  geom_col(mapping = aes(x = District, y = Crimes)) +
  geom_line(aes(x = District, y = PctArrest*10000, group = 1), color = "yellow") +
  scale_y_continuous(sec.axis = sec_axis(~./10000, name = "PctArrest")) +
  theme(axis.text.x = element_text(hjust = 1))

## INITIALIZE
library("leaflet")
library("data.table")
library("sp")
library("rgdal")
# library("maptools")
library("KernSmooth")

# Convert districts_false to a data.table in place.
setDT(districts_false)

#devtools::install_github("dkahle/ggmap", ref = "tidyup", force = TRUE)
library(ggmap)

# Download the base map for the given bounding box and write each district
# label at the coordinates stored in `districts`.
# NOTE(review): recent ggmap releases appear to have replaced get_stamenmap()
# with get_stadiamap() -- confirm the installed version still provides it.
chicago <- get_stamenmap(
  c(left = -88.0225, bottom = 41.5949, right = -87.2713, top = 42.0677),
  zoom = 11
)
ggmap(chicago) +
  geom_text(data = districts,
            mapping = aes(x = longitude, y = latitude, label = district))
```


```{r}

library(ggmap)

# Same bounding box and zoom as the previous map; here the labels are read
# directly from the police_districts table (upper-case column names).
chicago <- get_stamenmap(
  bbox = c(left = -88.0225, bottom = 41.5949, right = -87.2713, top = 42.0677),
  zoom = 11
)
ggmap(chicago) +
  geom_text(
    data = police_districts,
    mapping = aes(x = LONGITUDE, y = LATITUDE, label = DISTRICT)
  )
```

```{r}
# Bar chart of the number of incidents per police district.
#   data  - data frame with one row per incident and a `district` column
#   title - plot title
# Returns the ggplot object; calling it at top level prints the chart.
plot_crimes_by_district <- function(data, title) {
  ggplot(data = data) +
    geom_bar(mapping = aes(x = district)) +
    theme(axis.text.x = element_text(hjust = 1)) +
    ggtitle(title)
}

# One chart per crime-type subset, in the original order.
plot_crimes_by_district(assault, "ASSAULT BY DISTRICT")

plot_crimes_by_district(theft, "THEFTS BY DISTRICT")

plot_crimes_by_district(violent_crime, "VIOLENT CRIMES BY DISTRICT")

plot_crimes_by_district(narcotics, "NARCOTIC CRIMES BY DISTRICT")

plot_crimes_by_district(weapons_violation, "WEAPON-RELATED CRIMES BY DISTRICT")

plot_crimes_by_district(robbery, "ROBBERIES BY DISTRICT")

plot_crimes_by_district(criminal_damage, "CRIMINAL DAMAGE CRIMES BY DISTRICT")

plot_crimes_by_district(deceptive_practice, "DECEPTIVE PRACTICE CRIMES BY DISTRICT")
```
```{r}
library(ggplot2)

# Tile grid of crime type (x) against district (y), filled by the arrest flag.
ggplot(chicago_crime_subset) +
  geom_tile(aes(x = primary_type, y = district, fill = arrest)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
```

```{r}
# District against crime type, one semi-transparent point per incident,
# coloured by the arrest flag.
library(ggplot2)
ggplot(chicago_crime_subset) +
  geom_point(
    aes(x = district, y = primary_type, color = arrest),
    alpha = 0.5
  )
```

## Association Rules

```{r}
# Keep only the columns that should become items and drop everything else.
chicago_crime_subset_2 <- subset(
  chicago_crime_subset,
  select = -c(case_number, block, ward, description, day, month,
              latitude, longitude, location_description)
)

# Round-trip through a CSV file so arules can read each row as one basket.
write.csv(chicago_crime_subset_2, "chicago_crime_AR.csv",
          quote = FALSE, row.names = FALSE)
library(arules)
# write.csv() emits a header line; skip it, otherwise the column names are
# read as one extra transaction whose items are the names themselves.
crime_transactions <- read.transactions("chicago_crime_AR.csv",
                                        sep = ",", skip = 1)

#deceptive_practice_2 <- subset(deceptive_practice, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
#write.csv(deceptive_practice_2,"deceptive_practice.csv", quote = FALSE, row.names = FALSE)
#dp_transactions <- read.transactions("deceptive_practice.csv", sep=",")
```
```{r}
# Install RColorBrewer only when it is missing, then attach it.
# requireNamespace() tests availability without attaching; library() fails
# loudly if the package still cannot be loaded.
if (!requireNamespace("RColorBrewer", quietly = TRUE)) {
  install.packages("RColorBrewer")
}
library(RColorBrewer)

# Absolute frequency of the 20 most frequent items.
itemFrequencyPlot(crime_transactions,
                  topN = 20,
                  type = "absolute",
                  col = brewer.pal(8, "Pastel2"),
                  main = "Absolute Item Frequency Plot")
```
## Reglas de Asociacion General
```{r}
# Rule generation over all transactions
# (minimum support 0.001, minimum confidence 0.7).
association.rules.clean <- apriori(crime_transactions,
                                   parameter = list(supp = 0.001, conf = 0.7))

# Indices of redundant rules: columns of the subset matrix with a sum above 1,
# i.e. rules that contain every item of at least one other rule.
subset.rules.clean <- which(
  colSums(is.subset(association.rules.clean, association.rules.clean)) > 1
)
# x[-integer(0)] selects nothing, so only drop rules when some were flagged.
subset.association.rules.clean. <- if (length(subset.rules.clean) > 0) {
  association.rules.clean[-subset.rules.clean]
} else {
  association.rules.clean
}
inspect(subset.association.rules.clean.)

# Orderings of the full (unpruned) rule set.
rules_by_count <- sort(association.rules.clean, by = "count")
rules_by_conf <- sort(association.rules.clean, by = "confidence")
# NOTE: despite its name, rules_by_supp is ordered by lift.
rules_by_supp <- sort(association.rules.clean, by = "lift")
inspect(rules_by_count)
inspect(rules_by_conf)
inspect(rules_by_supp)
```
```{r}
# Rule generation: rules whose right-hand side is "ASSAULT"
# (minimum support 0.001, minimum confidence 0.1).
assault.association.rules <- apriori(
  crime_transactions,
  parameter = list(supp = 0.001, conf = 0.1),
  appearance = list(default = "lhs", rhs = "ASSAULT")
)

# Remove redundant rules: those containing every item of another rule.
assault.subset.rules <- which(
  colSums(is.subset(assault.association.rules, assault.association.rules)) > 1
)
# x[-integer(0)] selects nothing, so only drop rules when some were flagged.
assault.subset.association.rules. <- if (length(assault.subset.rules) > 0) {
  assault.association.rules[-assault.subset.rules]
} else {
  assault.association.rules
}
inspect(assault.subset.association.rules.)

# Orderings of the full (unpruned) rule set.
as_by_count <- sort(assault.association.rules, by = "count")
as_by_conf <- sort(assault.association.rules, by = "confidence")
inspect(as_by_count)
inspect(as_by_conf)
```
```{r}
# Rule generation: rules whose right-hand side is "CRIMINAL DAMAGE"
# (minimum support 0.001, minimum confidence 0.1).
cd.association.rules <- apriori(
  crime_transactions,
  parameter = list(supp = 0.001, conf = 0.1),
  appearance = list(default = "lhs", rhs = "CRIMINAL DAMAGE")
)

# Remove redundant rules: those containing every item of another rule.
cd.subset.rules <- which(
  colSums(is.subset(cd.association.rules, cd.association.rules)) > 1
)
# x[-integer(0)] selects nothing, so only drop rules when some were flagged.
cd.subset.association.rules. <- if (length(cd.subset.rules) > 0) {
  cd.association.rules[-cd.subset.rules]
} else {
  cd.association.rules
}
# This chunk lists the full rule set, not the pruned one.
inspect(cd.association.rules)

# Orderings of the full (unpruned) rule set.
cd_by_count <- sort(cd.association.rules, by = "count")
cd_by_conf <- sort(cd.association.rules, by = "confidence")
inspect(cd_by_count)
inspect(cd_by_conf)
```
```{r}
# Rule generation: rules whose right-hand side is "DECEPTIVE PRACTICE"
# (minimum support 0.001, minimum confidence 0.1).
dp.association.rules <- apriori(
  crime_transactions,
  parameter = list(supp = 0.001, conf = 0.1),
  appearance = list(default = "lhs", rhs = "DECEPTIVE PRACTICE")
)

# Remove redundant rules: those containing every item of another rule.
dp.subset.rules <- which(
  colSums(is.subset(dp.association.rules, dp.association.rules)) > 1
)
# x[-integer(0)] selects nothing, so only drop rules when some were flagged.
dp.subset.association.rules. <- if (length(dp.subset.rules) > 0) {
  dp.association.rules[-dp.subset.rules]
} else {
  dp.association.rules
}
inspect(dp.subset.association.rules.)

# Orderings of the pruned rule set.
dp_by_count <- sort(dp.subset.association.rules., by = "count")
dp_by_conf <- sort(dp.subset.association.rules., by = "confidence")
#dp_by_supp <- sort(dp.subset.association.rules., by = "support")
inspect(dp_by_count)
inspect(dp_by_conf)
#inspect(dp_by_supp)
```
```{r}
# Rule generation: rules whose right-hand side is "NARCOTICS"
# (minimum support 0.001, minimum confidence 0.1).
narcotics_clean.association.rules <- apriori(
  crime_transactions,
  parameter = list(supp = 0.001, conf = 0.1),
  appearance = list(default = "lhs", rhs = "NARCOTICS")
)

# Remove redundant rules: those containing every item of another rule.
narcotics_clean.subset.rules <- which(
  colSums(is.subset(narcotics_clean.association.rules,
                    narcotics_clean.association.rules)) > 1
)
# x[-integer(0)] selects nothing, so only drop rules when some were flagged.
narcotics_clean.subset.association.rules. <-
  if (length(narcotics_clean.subset.rules) > 0) {
    narcotics_clean.association.rules[-narcotics_clean.subset.rules]
  } else {
    narcotics_clean.association.rules
  }
inspect(narcotics_clean.subset.association.rules.)

# Orderings of the full (unpruned) rule set.
narc_by_count <- sort(narcotics_clean.association.rules, by = "count")
narc_by_conf <- sort(narcotics_clean.association.rules, by = "confidence")
inspect(narc_by_count)
inspect(narc_by_conf)
```
```{r}
# Rule generation: rules whose right-hand side is "ROBBERY"
# (minimum support 0.001, minimum confidence 0.15).
robbery.association.rules <- apriori(
  crime_transactions,
  parameter = list(supp = 0.001, conf = 0.15),
  appearance = list(default = "lhs", rhs = "ROBBERY")
)

# Remove redundant rules: those containing every item of another rule.
robbery.subset.rules <- which(
  colSums(is.subset(robbery.association.rules, robbery.association.rules)) > 1
)
# x[-integer(0)] selects nothing, so only drop rules when some were flagged.
robbery.subset.association.rules. <- if (length(robbery.subset.rules) > 0) {
  robbery.association.rules[-robbery.subset.rules]
} else {
  robbery.association.rules
}
# This chunk lists the full rule set, not the pruned one.
inspect(robbery.association.rules)

# Orderings of the full (unpruned) rule set.
rob_by_count <- sort(robbery.association.rules, by = "count")
rob_by_conf <- sort(robbery.association.rules, by = "confidence")
inspect(rob_by_count)
inspect(rob_by_conf)
```
```{r}
# Rule generation: rules whose right-hand side is "THEFT"
# (minimum support 0.005, minimum confidence 0.5).
theft.association.rules <- apriori(
  crime_transactions,
  parameter = list(supp = 0.005, conf = 0.5),
  appearance = list(default = "lhs", rhs = "THEFT")
)

# Remove redundant rules: those containing every item of another rule.
theft.subset.rules <- which(
  colSums(is.subset(theft.association.rules, theft.association.rules)) > 1
)
# x[-integer(0)] selects nothing, so only drop rules when some were flagged.
theft.subset.association.rules. <- if (length(theft.subset.rules) > 0) {
  theft.association.rules[-theft.subset.rules]
} else {
  theft.association.rules
}
inspect(theft.subset.association.rules.)

# Orderings of the full (unpruned) rule set.
theft_by_count <- sort(theft.association.rules, by = "count")
theft_by_conf <- sort(theft.association.rules, by = "confidence")
inspect(theft_by_count)
inspect(theft_by_conf)
```
```{r}
# Rule generation: rules whose right-hand side is "VIOLENT CRIME"
# (minimum support 0.001, minimum confidence 0.15).
vc.association.rules <- apriori(
  crime_transactions,
  parameter = list(supp = 0.001, conf = 0.15),
  appearance = list(default = "lhs", rhs = "VIOLENT CRIME")
)

# Remove redundant rules: those containing every item of another rule.
vc.subset.rules <- which(
  colSums(is.subset(vc.association.rules, vc.association.rules)) > 1
)
# x[-integer(0)] selects nothing, so only drop rules when some were flagged.
vc.subset.association.rules. <- if (length(vc.subset.rules) > 0) {
  vc.association.rules[-vc.subset.rules]
} else {
  vc.association.rules
}
inspect(vc.subset.association.rules.)

# Orderings of the full (unpruned) rule set.
vc_by_count <- sort(vc.association.rules, by = "count")
vc_by_conf <- sort(vc.association.rules, by = "confidence")
#vc_by_supp <- sort(vc.subset.association.rules., by = "support")
inspect(vc_by_count)
inspect(vc_by_conf)
```
```{r}
# Rule generation: rules whose right-hand side is "WEAPONS VIOLATION"
# (minimum support 0.001, minimum confidence 0.1).
wv.association.rules <- apriori(
  crime_transactions,
  parameter = list(supp = 0.001, conf = 0.1),
  appearance = list(default = "lhs", rhs = "WEAPONS VIOLATION")
)

# Remove redundant rules: those containing every item of another rule.
wv.subset.rules <- which(
  colSums(is.subset(wv.association.rules, wv.association.rules)) > 1
)
# x[-integer(0)] selects nothing, so only drop rules when some were flagged.
wv.subset.association.rules. <- if (length(wv.subset.rules) > 0) {
  wv.association.rules[-wv.subset.rules]
} else {
  wv.association.rules
}
inspect(wv.subset.association.rules.)

# Orderings of the full (unpruned) rule set.
wv_by_count <- sort(wv.association.rules, by = "count")
wv_by_conf <- sort(wv.association.rules, by = "confidence")
#wv_by_supp <- sort(wv.subset.association.rules., by = "support")
inspect(wv_by_count)
inspect(wv_by_conf)
#inspect(wv_by_supp)
```
```{r}
# Rule generation: rules whose right-hand side is the item "True"
# (minimum support 0.001, minimum confidence 0.5).
true.association.rules <- apriori(
  crime_transactions,
  parameter = list(supp = 0.001, conf = 0.5),
  appearance = list(default = "lhs", rhs = "True")
)

# Remove redundant rules: those containing every item of another rule.
true.subset.rules <- which(
  colSums(is.subset(true.association.rules, true.association.rules)) > 1
)
# x[-integer(0)] selects nothing, so only drop rules when some were flagged.
true.subset.association.rules. <- if (length(true.subset.rules) > 0) {
  true.association.rules[-true.subset.rules]
} else {
  true.association.rules
}
inspect(true.subset.association.rules.)

# Orderings of the pruned rule set.
t_by_count <- sort(true.subset.association.rules., by = "count")
t_by_conf <- sort(true.subset.association.rules., by = "confidence")
inspect(t_by_count)
inspect(t_by_conf)
```
```{r}
# Rule generation: rules whose right-hand side is the item "False"
# (minimum support 0.001, minimum confidence 0.8).
false.association.rules <- apriori(
  crime_transactions,
  parameter = list(supp = 0.001, conf = 0.8),
  appearance = list(default = "lhs", rhs = "False")
)

# Remove redundant rules: those containing every item of another rule.
false.subset.rules <- which(
  colSums(is.subset(false.association.rules, false.association.rules)) > 1
)
# x[-integer(0)] selects nothing, so only drop rules when some were flagged.
false.subset.association.rules. <- if (length(false.subset.rules) > 0) {
  false.association.rules[-false.subset.rules]
} else {
  false.association.rules
}
inspect(false.subset.association.rules.)

# Orderings of the full (unpruned) rule set.
f_by_count <- sort(false.association.rules, by = "count")
f_by_conf <- sort(false.association.rules, by = "confidence")
inspect(f_by_count)
inspect(f_by_conf)
```
```{r}
# Rule generation: rules whose right-hand side is the item "8"
# (minimum support 0.0001, minimum confidence 0.01).
ocho.association.rules <- apriori(
  crime_transactions,
  parameter = list(supp = 0.0001, conf = 0.01),
  appearance = list(default = "lhs", rhs = "8")
)

# Remove redundant rules: those containing every item of another rule.
ocho.subset.rules <- which(
  colSums(is.subset(ocho.association.rules, ocho.association.rules)) > 1
)
# x[-integer(0)] selects nothing, so only drop rules when some were flagged.
ocho.subset.association.rules. <- if (length(ocho.subset.rules) > 0) {
  ocho.association.rules[-ocho.subset.rules]
} else {
  ocho.association.rules
}
inspect(ocho.subset.association.rules.)

# Orderings of the full (unpruned) rule set.
ocho_by_count <- sort(ocho.association.rules, by = "count")
ocho_by_conf <- sort(ocho.association.rules, by = "confidence")
inspect(ocho_by_count)
inspect(ocho_by_conf)
```

```{r}
## PLOTS
## Whole dataset
library(arulesViz)

# Keep the rules whose confidence exceeds 0.7.
subRules <- association.rules.clean[
  quality(association.rules.clean)[["confidence"]] > 0.7
]
# Two-key plot of the filtered rules.
plot(subRules, method = "two-key plot")

# The 25 highest-confidence rules (the variable name says 10, n is 25).
top10subRules <- head(subRules, n = 25, by = "confidence")

# Interactive graph; engine = "htmlwidget" makes the plot interactive.
plot(top10subRules, method = "graph", engine = "htmlwidget")

# Parallel-coordinates view of the 25 highest-confidence rules.
subRules2 <- head(subRules, n = 25, by = "confidence")
plot(subRules2, method = "paracoord")

```
```{r}
# Two-key plot of the pruned ASSAULT rules.
plot(assault.subset.association.rules., method = "two-key plot")

# The 20 highest-confidence pruned rules, stored under both names.
top10subRules <- head(assault.subset.association.rules.,
                      n = 20, by = "confidence")
subRules2 <- head(assault.subset.association.rules.,
                  n = 20, by = "confidence")
inspect(top10subRules)

# Interactive graph of all pruned rules.
plot(assault.subset.association.rules.,
     method = "graph", engine = "htmlwidget")

# Parallel-coordinates view of the 20 highest-confidence rules.
plot(top10subRules, method = "paracoord")
```
```{r}
# Two-key plot of all CRIMINAL DAMAGE rules.
plot(cd.association.rules, method = "two-key plot")

# Rules with confidence above 0.2, and the 10 highest-confidence rules.
subRules_cd <- cd.association.rules[
  quality(cd.association.rules)[["confidence"]] > 0.2
]
top10subRules <- head(cd.association.rules, n = 10, by = "confidence")
inspect(top10subRules)

# Interactive graph of the rules above the confidence threshold.
plot(subRules_cd, method = "graph", engine = "htmlwidget")

# Up to 25 highest-confidence rules above the threshold (kept, not plotted).
subRules2 <- head(subRules_cd, n = 25, by = "confidence")
# Parallel-coordinates view of the 10 highest-confidence rules.
plot(top10subRules, method = "paracoord")
```
```{r}
# Two-key plot of all DECEPTIVE PRACTICE rules.
plot(dp.association.rules, method = "two-key plot")

# Rules with confidence above 0.1, and the 10 rules with the highest count.
subRules_dp <- dp.association.rules[
  quality(dp.association.rules)[["confidence"]] > 0.1
]
top10subRules <- head(dp.association.rules, n = 10, by = "count")

# Interactive graph of the rules above the confidence threshold.
plot(subRules_dp, method = "graph", engine = "htmlwidget")

# Up to 25 highest-count rules above the threshold (kept, not plotted).
subRules2 <- head(subRules_dp, n = 25, by = "count")
# Parallel-coordinates view of every rule above the threshold.
plot(subRules_dp, method = "paracoord")
```
```{r}
# Two-key plot of all NARCOTICS rules.
plot(narcotics_clean.association.rules, method = "two-key plot")

# Rules with confidence above 0.6, and the 10 highest-confidence rules.
subRules_narcotics <- narcotics_clean.association.rules[
  quality(narcotics_clean.association.rules)[["confidence"]] > 0.6
]
top10subRules <- head(narcotics_clean.association.rules,
                      n = 10, by = "confidence")

# Interactive graph of the rules above the confidence threshold.
plot(subRules_narcotics, method = "graph", engine = "htmlwidget")

# Up to 25 highest-confidence rules above the threshold (kept, not plotted).
subRules2 <- head(subRules_narcotics, n = 25, by = "confidence")
# Parallel-coordinates view of every rule above the threshold.
plot(subRules_narcotics, method = "paracoord")
```
```{r}
# Two-key plot of all ROBBERY rules.
plot(robbery.association.rules, method = "two-key plot")

# Rules with confidence above 0.15, and the 10 highest-confidence rules.
subRules_robbery <- robbery.association.rules[
  quality(robbery.association.rules)[["confidence"]] > 0.15
]
top10subRules <- head(robbery.association.rules, n = 10, by = "confidence")
inspect(top10subRules)

# Interactive graph of the rules above the confidence threshold.
plot(subRules_robbery, method = "graph", engine = "htmlwidget")

# Up to 25 highest-confidence rules above the threshold (kept, not plotted).
subRules2 <- head(subRules_robbery, n = 25, by = "confidence")
# Parallel-coordinates view of the 10 highest-confidence rules.
plot(top10subRules, method = "paracoord")
```
```{r}
# Two-key plot of all THEFT rules.
plot(theft.association.rules, method = "two-key plot")

# Rules with confidence above 0.45, and the 10 highest-confidence rules.
subRules_theft <- theft.association.rules[
  quality(theft.association.rules)[["confidence"]] > 0.45
]
top10subRules <- head(theft.association.rules, n = 10, by = "confidence")
inspect(top10subRules)

# Interactive graph of the rules above the confidence threshold.
plot(subRules_theft, method = "graph", engine = "htmlwidget")

# Parallel-coordinates view of the 10 highest-confidence rules.
plot(top10subRules, method = "paracoord")
```
```{r}
# Two-key plot of all VIOLENT CRIME rules.
plot(vc.association.rules, method = "two-key plot")

# Rules with confidence above 0.15, and the 10 highest-confidence rules.
subRules_vc <- vc.association.rules[
  quality(vc.association.rules)[["confidence"]] > 0.15
]
top10subRules <- head(vc.association.rules, n = 10, by = "confidence")
inspect(top10subRules)

# Interactive graph of the rules above the confidence threshold.
plot(subRules_vc, method = "graph", engine = "htmlwidget")

# Parallel-coordinates view of the 10 highest-confidence rules.
plot(top10subRules, method = "paracoord")
```
```{r}
# Two-key plot of all WEAPONS VIOLATION rules.
plot(wv.association.rules, method = "two-key plot")

# Rules with confidence above 0.1, and the 10 highest-confidence rules.
subRules_wv <- wv.association.rules[
  quality(wv.association.rules)[["confidence"]] > 0.1
]
top10subRules <- head(wv.association.rules, n = 10, by = "confidence")
inspect(top10subRules)

# Interactive graph of the rules above the confidence threshold.
plot(subRules_wv, method = "graph", engine = "htmlwidget")

# Parallel-coordinates view of the 10 highest-confidence rules.
plot(top10subRules, method = "paracoord")
```

```{r}
# Two-key plot of all rules whose right-hand side is the item "8".
plot(ocho.association.rules, method = "two-key plot")

# Rules with confidence above 0.01, and the 20 highest-confidence rules.
subRules_8 <- ocho.association.rules[
  quality(ocho.association.rules)[["confidence"]] > 0.01
]
top10subRules <- head(ocho.association.rules, n = 20, by = "confidence")
inspect(top10subRules)

# Interactive graph of the rules above the confidence threshold.
plot(subRules_8, method = "graph", engine = "htmlwidget")

# Parallel-coordinates view of the 20 highest-confidence rules.
plot(top10subRules, method = "paracoord")
```

## Mapas de Densidad

```{r}
## INITIALIZE
library("leaflet")
library("data.table")
library("sp")
library("rgdal")
# library("maptools")
library("KernSmooth")
library(viridis)
library(RColorBrewer)

# For every crime-type subset: drop rows containing NA, then convert the
# result to a data.table.
assault <- setDT(na.omit(assault))
criminal_damage <- setDT(na.omit(criminal_damage))
deceptive_practice <- setDT(na.omit(deceptive_practice))
narcotics <- setDT(na.omit(narcotics))
robbery <- setDT(na.omit(robbery))
theft <- setDT(na.omit(theft))
violent_crime <- setDT(na.omit(violent_crime))
weapons_violation <- setDT(na.omit(weapons_violation))

## CONTOUR LINES
## Assault: binned 2-D kernel density estimate of the incident coordinates,
## then contour lines over the estimated density grid.
kde_assault <- bkde2D(assault[, list(longitude, latitude)],
                      bandwidth = c(.0001, .0001), gridsize = c(75, 75))
CL_assault <- contourLines(kde_assault$x1, kde_assault$x2, kde_assault$fhat)

## Level of each contour line as a factor, and the number of distinct levels.
## vapply() keeps the result numeric even when there are no contour lines.
LEVS_assault <- as.factor(vapply(CL_assault, `[[`, numeric(1), "level"))
NLEV_assault <- length(levels(LEVS_assault))

## One polygon per contour line; seq_along() yields no iterations for an
## empty list, whereas 1:length() would index element 1 and 0.
pgons_assault <- lapply(seq_along(CL_assault), function(i)
  Polygons(list(Polygon(cbind(CL_assault[[i]]$x, CL_assault[[i]]$y))), ID = i))
spgons_assault <- SpatialPolygons(pgons_assault)

## Criminal Damage: binned 2-D kernel density estimate of the incident
## coordinates, then contour lines over the estimated density grid.
kde_cd <- bkde2D(criminal_damage[, list(longitude, latitude)],
                 bandwidth = c(.0001, .0001), gridsize = c(75, 75))
CL_cd <- contourLines(kde_cd$x1, kde_cd$x2, kde_cd$fhat)

## Level of each contour line as a factor, and the number of distinct levels.
## vapply() keeps the result numeric even when there are no contour lines.
LEVS_cd <- as.factor(vapply(CL_cd, `[[`, numeric(1), "level"))
NLEV_cd <- length(levels(LEVS_cd))

## One polygon per contour line; seq_along() yields no iterations for an
## empty list, whereas 1:length() would index element 1 and 0.
pgons_cd <- lapply(seq_along(CL_cd), function(i)
  Polygons(list(Polygon(cbind(CL_cd[[i]]$x, CL_cd[[i]]$y))), ID = i))
spgons_cd <- SpatialPolygons(pgons_cd)

## Deceptive Practice: binned 2-D kernel density estimate of the incident
## coordinates, then contour lines over the estimated density grid.
kde_dp <- bkde2D(deceptive_practice[, list(longitude, latitude)],
                 bandwidth = c(.0001, .0001), gridsize = c(75, 75))
CL_dp <- contourLines(kde_dp$x1, kde_dp$x2, kde_dp$fhat)

## Level of each contour line as a factor, and the number of distinct levels.
## vapply() keeps the result numeric even when there are no contour lines.
LEVS_dp <- as.factor(vapply(CL_dp, `[[`, numeric(1), "level"))
NLEV_dp <- length(levels(LEVS_dp))

## One polygon per contour line; seq_along() yields no iterations for an
## empty list, whereas 1:length() would index element 1 and 0.
pgons_dp <- lapply(seq_along(CL_dp), function(i)
  Polygons(list(Polygon(cbind(CL_dp[[i]]$x, CL_dp[[i]]$y))), ID = i))
spgons_dp <- SpatialPolygons(pgons_dp)

## Narcotics: binned 2-D kernel density estimate of the incident coordinates,
## then contour lines over the estimated density grid.
kde_narcotics <- bkde2D(narcotics[, list(longitude, latitude)],
                        bandwidth = c(.0001, .0001), gridsize = c(75, 75))
CL_narcotics <- contourLines(kde_narcotics$x1, kde_narcotics$x2,
                             kde_narcotics$fhat)

## Level of each contour line as a factor, and the number of distinct levels.
## vapply() keeps the result numeric even when there are no contour lines.
LEVS_narcotics <- as.factor(vapply(CL_narcotics, `[[`, numeric(1), "level"))
NLEV_narcotics <- length(levels(LEVS_narcotics))

## One polygon per contour line; seq_along() yields no iterations for an
## empty list, whereas 1:length() would index element 1 and 0.
pgons_narcotics <- lapply(seq_along(CL_narcotics), function(i)
  Polygons(list(Polygon(cbind(CL_narcotics[[i]]$x, CL_narcotics[[i]]$y))),
           ID = i))
spgons_narcotics <- SpatialPolygons(pgons_narcotics)

## Robbery: binned 2-D kernel density estimate of the incident coordinates,
## then contour lines over the estimated density grid.
kde_robbery <- bkde2D(robbery[, list(longitude, latitude)],
                      bandwidth = c(.0001, .0001), gridsize = c(75, 75))
CL_robbery <- contourLines(kde_robbery$x1, kde_robbery$x2, kde_robbery$fhat)

## Level of each contour line as a factor, and the number of distinct levels.
## vapply() keeps the result numeric even when there are no contour lines.
LEVS_robbery <- as.factor(vapply(CL_robbery, `[[`, numeric(1), "level"))
NLEV_robbery <- length(levels(LEVS_robbery))

## One polygon per contour line; seq_along() yields no iterations for an
## empty list, whereas 1:length() would index element 1 and 0.
pgons_robbery <- lapply(seq_along(CL_robbery), function(i)
  Polygons(list(Polygon(cbind(CL_robbery[[i]]$x, CL_robbery[[i]]$y))), ID = i))
spgons_robbery <- SpatialPolygons(pgons_robbery)

## Thefts: binned 2-D kernel density estimate of the incident coordinates,
## then contour lines over the estimated density grid.
kde_theft <- bkde2D(theft[, list(longitude, latitude)],
                    bandwidth = c(.0001, .0001), gridsize = c(75, 75))
CL_theft <- contourLines(kde_theft$x1, kde_theft$x2, kde_theft$fhat)

## Level of each contour line as a factor, and the number of distinct levels.
## vapply() keeps the result numeric even when there are no contour lines.
LEVS_theft <- as.factor(vapply(CL_theft, `[[`, numeric(1), "level"))
NLEV_theft <- length(levels(LEVS_theft))

## One polygon per contour line; seq_along() yields no iterations for an
## empty list, whereas 1:length() would index element 1 and 0.
pgons_theft <- lapply(seq_along(CL_theft), function(i)
  Polygons(list(Polygon(cbind(CL_theft[[i]]$x, CL_theft[[i]]$y))), ID = i))
spgons_theft <- SpatialPolygons(pgons_theft)

## Violent Crimes: binned 2-D kernel density estimate of the incident
## coordinates, then contour lines over the estimated density grid.
kde_vc <- bkde2D(violent_crime[, list(longitude, latitude)],
                 bandwidth = c(.0001, .0001), gridsize = c(75, 75))
CL_vc <- contourLines(kde_vc$x1, kde_vc$x2, kde_vc$fhat)

## Level of each contour line as a factor, and the number of distinct levels.
## vapply() keeps the result numeric even when there are no contour lines.
LEVS_vc <- as.factor(vapply(CL_vc, `[[`, numeric(1), "level"))
NLEV_vc <- length(levels(LEVS_vc))

## One polygon per contour line; seq_along() yields no iterations for an
## empty list, whereas 1:length() would index element 1 and 0.
pgons_vc <- lapply(seq_along(CL_vc), function(i)
  Polygons(list(Polygon(cbind(CL_vc[[i]]$x, CL_vc[[i]]$y))), ID = i))
spgons_vc <- SpatialPolygons(pgons_vc)

## Weapons Violation: binned 2-D kernel density estimate of the incident
## coordinates, then contour lines over the estimated density grid.
kde_wv <- bkde2D(weapons_violation[, list(longitude, latitude)],
                 bandwidth = c(.0001, .0001), gridsize = c(75, 75))
CL_wv <- contourLines(kde_wv$x1, kde_wv$x2, kde_wv$fhat)

## Level of each contour line as a factor, and the number of distinct levels.
## vapply() keeps the result numeric even when there are no contour lines.
LEVS_wv <- as.factor(vapply(CL_wv, `[[`, numeric(1), "level"))
NLEV_wv <- length(levels(LEVS_wv))

## One polygon per contour line; seq_along() yields no iterations for an
## empty list, whereas 1:length() would index element 1 and 0.
pgons_wv <- lapply(seq_along(CL_wv), function(i)
  Polygons(list(Polygon(cbind(CL_wv[[i]]$x, CL_wv[[i]]$y))), ID = i))
spgons_wv <- SpatialPolygons(pgons_wv)

# Colours for `n_levels` contour levels taken from a ColorBrewer palette.
# brewer.pal() never returns more colours than the palette's maximum, so
# indexing its result with a higher level would give NA colours. Above that
# maximum the largest palette is interpolated; otherwise brewer.pal() is
# called exactly as before.
contour_palette <- function(n_levels, palette) {
  max_colors <- brewer.pal.info[palette, "maxcolors"]
  if (n_levels <= max_colors) {
    brewer.pal(n_levels, name = palette)
  } else {
    colorRampPalette(brewer.pal(max_colors, name = palette))(n_levels)
  }
}

# Interactive map: one toggleable polygon layer per crime type, coloured by
# contour level, plus text labels at the police station coordinates.
leaflet() %>% addTiles() %>%
    addPolygons(data = spgons_narcotics, color = contour_palette(NLEV_narcotics, "YlOrRd")[LEVS_narcotics], group = "Narcotics") %>%
    addPolygons(data = spgons_assault, color = contour_palette(NLEV_assault, "Reds")[LEVS_assault], group = "Assault") %>%
    addPolygons(data = spgons_cd, color = contour_palette(NLEV_cd, "YlGnBu")[LEVS_cd], group = "Criminal Damage") %>%
    addPolygons(data = spgons_dp, color = contour_palette(NLEV_dp, "YlGn")[LEVS_dp], group = "Deceptive Practice") %>%
    addPolygons(data = spgons_robbery, color = contour_palette(NLEV_robbery, "Purples")[LEVS_robbery], group = "Robbery") %>%
    addPolygons(data = spgons_theft, color = contour_palette(NLEV_theft, "Oranges")[LEVS_theft], group = "Thefts") %>%
    addPolygons(data = spgons_vc, color = contour_palette(NLEV_vc, "Greys")[LEVS_vc], group = "Violent Crimes") %>%
    addPolygons(data = spgons_wv, color = contour_palette(NLEV_wv, "Blues")[LEVS_wv], group = "Weapons Violation") %>%
    addLabelOnlyMarkers(districts$longitude, districts$latitude, label =  districts$district, 
                      labelOptions = labelOptions(noHide = TRUE, direction = 'top', textOnly = TRUE), group = "Districts") %>%
    addLayersControl(overlayGroups = c("Assault", "Criminal Damage","Deceptive Practice", "Narcotics","Robbery","Thefts","Violent Crimes","Weapons Violation", "Districts"),options = layersControlOptions(collapsed = FALSE))
    
#addCircles(lng = narcotics$longitude, lat = narcotics$latitude,radius = .1, opacity = .4, col = "blue", group = "Points") %>%


```
```{r}
#leaflet() %>% addTiles() %>%
#    addCircles(lng = weapons_violation$longitude, lat = weapons_violation$latitude,radius = .05, opacity = 0.1, col = brewer.pal(10,name = "Reds"), group = "Narcotics") %>%
#    addLabelOnlyMarkers(districts$longitude, districts$latitude, label =  districts$district, 
#                      labelOptions = labelOptions(noHide = T, direction = 'top', textOnly = T, textsize = "15px"), group = #"Districts") %>%
#    addLayersControl(overlayGroups = c("Assault", "Criminal Damage","Deceptive Practice", "Narcotics","Robbery","Thefts","Violent Crimes","Weapons Violation", "Districts"),options = layersControlOptions(collapsed = FALSE))

```
## Clustering 
```{r}
# Columns kept for clustering: everything except identifiers, free text,
# date parts and coordinates.
chicago_crime_clustering <- subset(
  chicago_crime_subset,
  select = -c(location_description, case_number, block, ward, description,
              day, month, latitude, longitude)
)
unique(chicago_crime_clustering$primary_type)

library(data.table)
library(mltools)
#Crime_chicago_dummy <- one_hot(as.data.table(Crime_chicago_def_clust))

# Integer-encode the categorical columns by order of first appearance;
# `types` keeps the original crime-type labels in code order.
types <- unique(chicago_crime_clustering$primary_type)
chicago_crime_clustering$primary_type <-
  match(chicago_crime_clustering$primary_type, types)
chicago_crime_clustering$arrest <-
  match(chicago_crime_clustering$arrest, unique(chicago_crime_clustering$arrest))
#chicago_crime_clustering$location_description <- match(chicago_crime_clustering$location_description, unique(chicago_crime_clustering$location_description))
# as.numeric() on a factor returns the internal level codes, not the district
# numbers; going through as.character() converts the labels themselves and is
# a no-op for columns that are already numeric or character.
chicago_crime_clustering$district <-
  as.numeric(as.character(chicago_crime_clustering$district))
test <- chicago_crime_clustering
#Normalization of variables
library(RSNNS)

# Training table: same column selection and integer encoding as the test
# table, applied to chicago_crime_subset_tr.
# NOTE(review): codes follow order of first appearance in this table, so they
# are not guaranteed to match the codes assigned in `test` - confirm.
train_set <- subset(
  chicago_crime_subset_tr,
  select = -c(location_description, case_number, block, ward, description,
              day, month, latitude, longitude)
)
train_set$primary_type <- match(train_set$primary_type,
                                unique(train_set$primary_type))
train_set$arrest <- match(train_set$arrest, unique(train_set$arrest))
#train_set$location_description <- match(train_set$location_description, unique(train_set$location_description))
# as.numeric() on a factor returns the internal level codes, not the district
# numbers; convert through as.character() to get the labels' values.
train_set$district <- as.numeric(as.character(train_set$district))

#index <- sample(nrow(chicago_crime_clustering_cut), round(0.75*nrow(chicago_crime_clustering_cut)))
#train <- Crime_chicago_def_clust_cut[index,] 
#test <- Crime_chicago_def_clust_cut[-index,]
# First column of each table, kept as the label.
train_label <- train_set[,1]
test_label <- test[,1]

# Packages for choosing the number of clusters (loaded here; no elbow plot is
# drawn in this block).
library(factoextra)
library(NbClust)
set.seed(123)
# Work on the first rows only. head() returns at most the rows that exist,
# whereas x[1:1000, ] pads a shorter table with all-NA rows.
train_small <- head(train_set, 1000)
test_small <- head(test, 100)

## Dendrograms
library(tidyverse)      #data manipulation and visualization
library(class)          # to call class package for kNN
library(caret)
library(cluster)
# Divisive hierarchical clustering (diana) on the training sample.
div <- diana(train_small)
plot(div, main = "Divisive")

# Agglomerative clustering with complete linkage on Euclidean distances.
distance <- dist(train_small, method = "euclidean")
agg <- hclust(distance, method = "complete")
plot(agg,
     main = "Agglomerative, complete linkages")

library(fpc)
# Summary table of cluster validation statistics for 2..k clusters.
#   dist - dissimilarity object handed to cluster.stats() as `d`
#   tree - hierarchical clustering result accepted by cutree()
#   k    - largest number of clusters to evaluate (at least 2)
# Returns a data frame with one column per cut ("Test 1" is 2 clusters, ...,
# "Test k-1" is k clusters). The first rows hold the statistics named in
# `clust.assess`, followed by one row per cluster with its size (0 when the
# cut has fewer clusters). Numeric columns are rounded to 2 decimals.
cstats.table <- function(dist, tree, k) {
  stopifnot(is.numeric(k), length(k) == 1L, k >= 2)

  clust.assess <- c("cluster.number", "n", "within.cluster.ss",
                    "average.within", "average.between",
                    "wb.ratio", "dunn2", "avg.silwidth")
  clust.size <- "cluster.size"
  row.clust <- paste("Cluster-", seq_len(k), " size")
  stats.names <- paste("Test", seq_len(k - 1))

  # Column 1 (a single cluster) is never filled and is dropped below.
  output.stats <- matrix(NA_real_, nrow = length(clust.assess), ncol = k)
  cluster.sizes <- matrix(NA_real_, nrow = k, ncol = k)

  for (i in 2:k) {
    # Evaluate each cut once instead of once per table cell.
    stats <- cluster.stats(d = dist, clustering = cutree(tree, k = i))
    output.stats[, i] <- unlist(stats[clust.assess])[seq_along(clust.assess)]
    # Indexing past the i available sizes yields NA, replaced by 0 below.
    cluster.sizes[, i] <- unlist(stats[clust.size])[seq_len(k)]
  }

  output.stats.df <- data.frame(output.stats)
  cluster.sizes <- data.frame(cluster.sizes)
  cluster.sizes[is.na(cluster.sizes)] <- 0
  rows.all <- c(clust.assess, row.clust)

  # drop = FALSE keeps a data frame when k = 2 leaves a single column.
  output <- rbind(output.stats.df, cluster.sizes)[, -1, drop = FALSE]
  colnames(output) <- stats.names
  rownames(output) <- rows.all

  is.num <- vapply(output, is.numeric, logical(1))
  output[is.num] <- lapply(output[is.num], round, 2)
  output
}
# Evaluate cuts of up to 7 clusters for both trees: a small enough number to
# compare the basic differences between the resulting groups.
stats.df.divisive <- cstats.table(dist = distance, tree = div, k = 7)
stats.df.divisive

# Complete linkage looked like the most balanced approach.
stats.df.aggl <- cstats.table(dist = distance, tree = agg, k = 7)
stats.df.aggl

#confusionMatrix(train_small, )
```
```{r}
library(data.table)
library(mltools)
#Crime_chicago_dummy <- one_hot(as.data.table(Crime_chicago_def_clust))
#narcotics <- subset(narcotics, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
#narcotics_tr <- subset(narcotics_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
# Narcotics-only tables for clustering, without the location description.
narcotics_clustering <- subset(narcotics, select=-c(location_description))
narcotics_clustering_tr <- subset(narcotics_tr, select=-c(location_description))

types <- unique(chicago_crime_clustering$primary_type)
# Encode the narcotics table from its OWN columns. Assigning codes computed
# from chicago_crime_clustering (a different table with a different number of
# rows) either fails on the length mismatch or attaches values from unrelated
# rows.
narcotics_clustering$primary_type <- match(narcotics_clustering$primary_type, unique(narcotics_clustering$primary_type))
narcotics_clustering$arrest <- match(narcotics_clustering$arrest, unique(narcotics_clustering$arrest))
#narcotics_clustering$location_description <- match(chicago_crime_clustering$location_description, unique(narcotics_clustering$location_description))
# as.numeric() on a factor returns the internal level codes, not the district
# numbers; convert through as.character() to get the labels' values.
narcotics_clustering$district <- as.numeric(as.character(narcotics_clustering$district))
test <- narcotics_clustering
#Normalization of variables
library(RSNNS)

# Training table for the narcotics subset, integer-encoded by order of first
# appearance of each value.
train_set <- narcotics_clustering_tr
train_set$primary_type <- match(train_set$primary_type,
                                unique(train_set$primary_type))
train_set$arrest <- match(train_set$arrest, unique(train_set$arrest))
#train_set$location_description <- match(train_set$location_description, unique(train_set$location_description))
# as.numeric() on a factor returns the internal level codes, not the district
# numbers; convert through as.character() to get the labels' values.
train_set$district <- as.numeric(as.character(train_set$district))

#index <- sample(nrow(chicago_crime_clustering_cut), round(0.75*nrow(chicago_crime_clustering_cut)))
#train <- Crime_chicago_def_clust_cut[index,] 
#test <- Crime_chicago_def_clust_cut[-index,]
# First column of each table, kept as the label.
train_label <- train_set[,1]
test_label <- test[,1]

#Optimum number of clusters. Elbow method
# Alternative using fviz function for Elbow method
library(factoextra)
library(NbClust)
set.seed(123)
train_small <- train_set[1:1000,]
test_small <- test[1:100,]

## Dendrograms
library(tidyverse) # data manipulation and visualisation
library(class)     # kNN
library(caret)
library(cluster)
library(fpc)

# Euclidean distances between the sampled training rows; reused by the
# agglomerative tree here and by the cluster statistics further down.
distance <- dist(train_small, method = "euclidean")

# Divisive (top-down) hierarchical clustering via diana().
div <- diana(train_small)

# Agglomerative (bottom-up) hierarchical clustering with complete linkage.
agg <- hclust(distance, method = "complete")

# Plot the divisive result first, then the agglomerative one.
plot(div, main = "Divisive")
plot(agg, main = "Agglomerative, complete linkages")
# Tabulate clustering statistics for every cut of a hierarchical tree from
# 2 clusters up to `k` clusters.
#
# Args:
#   dist: distance object, passed as `d` to cluster.stats().
#   tree: hierarchical clustering result that cutree() can cut.
#   k:    largest number of clusters to evaluate; must be at least 2.
#
# Returns:
#   A data frame with one column per cut ("Test 1" is the 2-cluster cut, ...,
#   the last column is the k-cluster cut). The first rows hold the
#   cluster.stats() measures listed in `clust.assess`; the following `k` rows
#   hold the size of each cluster, with 0 where a cut has fewer clusters.
#   Numeric values are rounded to two decimals.
cstats.table <- function(dist, tree, k) {
  # 2:k would count backwards for k < 2 and index outside the matrices.
  stopifnot(is.numeric(k), length(k) == 1, k >= 2)

  clust.assess <- c("cluster.number", "n", "within.cluster.ss", "average.within",
                    "average.between", "wb.ratio", "dunn2", "avg.silwidth")
  clust.size <- "cluster.size"
  row.clust <- paste("Cluster-", seq_len(k), " size")

  # Column 1 is never filled (there is no 1-cluster cut); it is dropped below.
  output.stats <- matrix(ncol = k, nrow = length(clust.assess))
  cluster.sizes <- matrix(ncol = k, nrow = k)

  for (i in 2:k) {
    # cluster.stats() is the expensive step, so it is evaluated once per cut
    # and both the quality measures and the cluster sizes are read from it.
    cut.stats <- cluster.stats(d = dist, clustering = cutree(tree, k = i))
    output.stats[, i] <- unlist(cut.stats[clust.assess])[seq_along(clust.assess)]
    # Positions beyond the i-th cluster come back as NA and become 0 below.
    cluster.sizes[, i] <- unlist(cut.stats[clust.size])[seq_len(k)]
  }

  output.stats.df <- data.frame(output.stats)
  cluster.sizes <- data.frame(cluster.sizes)
  cluster.sizes[is.na(cluster.sizes)] <- 0

  output <- rbind(output.stats.df, cluster.sizes)[, -1]
  colnames(output) <- paste("Test", seq_len(k - 1))
  rownames(output) <- c(clust.assess, row.clust)

  is.num <- vapply(output, is.numeric, logical(1))
  output[is.num] <- lapply(output[is.num], round, 2)
  output
}
# Cap the maximum number of clusters at 7.
# The aim is a reasonable number that still shows the basic differences between the resulting groups.
stats.df.divisive <- cstats.table(distance, div, 7)
stats.df.divisive

stats.df.aggl <-cstats.table(distance, agg, 7) # complete linkage looks like the most balanced approach
stats.df.aggl

#confusionMatrix(train_small, )
```

```{r}
library("ggplot2")
library("reshape2")
library("purrr")
library("dplyr")
library("dendextend")

# Dendrogram of the agglomerative tree, with branches coloured by the
# 8-cluster cut.
dendro <- as.dendrogram(agg)

# One colour per cluster. Only seven colours were supplied before for k = 8;
# dendextend recycles a short colour vector (with a warning), so the eighth
# entry spells out the colour that recycling produced.
branch_colours <- c(
  "darkslategray", "darkslategray4", "darkslategray3", "gold3",
  "darkcyan", "cyan3", "gold3", "darkslategray"
)

# set() is qualified because data.table exports a function of the same name.
dendro.col <- dendro %>%
  dendextend::set("branches_k_color", k = 8, value = branch_colours) %>%
  dendextend::set("branches_lwd", 0.6) %>%
  dendextend::set("labels_colors", value = c("darkslategray")) %>%
  dendextend::set("labels_cex", 0.5)

ggd1 <- as.ggdend(dendro.col)
ggplot(ggd1, theme = theme_minimal()) +
  labs(x = "Num. observations", y = "Height", title = "Dendrogram, k = 8")



```
## Árboles de Decisión
```{r}
## C5.0 decision trees

library(dplyr)
library(MASS)       # example data sets
library(tidyverse)  # data processing
library(rpart)      # CART decision trees
library(rpart.plot) # plotting CART trees
library(caret)      # confusion matrices and more
library(rsample)    # data splitting
library(data.table)
library(C50)

## Create the training and test data sets
set.seed(1234)

# Copies of both subsets without the columns that are not used by the trees.
chicago_crime_trees <- subset(
  chicago_crime_subset,
  select = -c(location_description, case_number, block, ward, description,
              day, month, latitude, longitude)
)
chicago_crime_trees_tr <- subset(
  chicago_crime_subset_tr,
  select = -c(location_description, case_number, block, ward, description,
              day, month, latitude, longitude)
)

# Factor columns become plain character columns so labels can be rewritten.
chicago_crime_trees <- chicago_crime_trees %>% mutate_if(is.factor, as.character)
chicago_crime_trees_tr <- chicago_crime_trees_tr %>% mutate_if(is.factor, as.character)

# Replace the long crime-type labels with short codes; every label that is not
# listed (including NA) is returned unchanged.
abbreviate_primary_type <- function(labels) {
  short_codes <- c(
    "ROBBERY" = "ROB",
    "NARCOTICS" = "NAR",
    "ASSAULT" = "ASS",
    "WEAPONS VIOLATION" = "WV",
    "CRIMINAL DAMAGE" = "CD",
    "VIOLENT CRIME" = "VC",
    "DECEPTIVE PRACTICE" = "DP",
    "THEFT" = "TH"
  )
  known <- labels %in% names(short_codes)
  labels[known] <- unname(short_codes[labels[known]])
  labels
}
chicago_crime_trees$primary_type <- abbreviate_primary_type(chicago_crime_trees$primary_type)
chicago_crime_trees_tr$primary_type <- abbreviate_primary_type(chicago_crime_trees_tr$primary_type)

# The model data are an 80/20 split of `chicago_crime_subset_tr` (same columns
# removed as above).
# NOTE(review): the split starts from `chicago_crime_subset_tr`, not from the
# abbreviated `chicago_crime_trees_tr` built above - confirm which is intended.
crime_split_c50 <- initial_split(
  subset(
    chicago_crime_subset_tr,
    select = -c(location_description, case_number, block, ward, description,
                day, month, latitude, longitude)
  ),
  prop = 0.8
)
train_c50 <- training(crime_split_c50)
test_c50 <- testing(crime_split_c50)

# C5.0 needs categorical columns as factors: training part first, then test part.
train_c50$arrest <- as.factor(train_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
train_c50$primary_type <- as.factor(train_c50$primary_type)

test_c50$arrest <- as.factor(test_c50$arrest)
test_c50$district <- as.factor(test_c50$district)
test_c50$primary_type <- as.factor(test_c50$primary_type)


# Fit a C5.0 tree that predicts `primary_type` from all remaining columns.
# CF is the pruning confidence factor (a higher CF means less pruning).
tree_result <- C5.0(
  primary_type ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.2)
)
summary(tree_result)

# Draw the fitted tree.
plot(tree_result, subtree = NULL)

## Prediction
# Test rows: share of predictions that differ from the observed class.
predictions <- predict(tree_result, newdata = test_c50, type = "class")
error_classification <- mean(predictions != test_c50$primary_type)
paste(
  "The classification error in test set is:", 100 * error_classification, "%",
  sum(predictions == test_c50$primary_type),
  "correct classified cases from", length(predictions)
)

# Training rows: the same measure on the data the tree was fitted to.
pred_train <- predict(tree_result, newdata = train_c50)
error_classification <- mean(pred_train != train_c50$primary_type)
paste(
  "The classification error in train set is:", 100 * error_classification, "%",
  sum(pred_train == train_c50$primary_type),
  "correct classified cases from", length(pred_train)
)


```
```{r}
# Fit a C5.0 tree that predicts `district` from all remaining columns of the
# current `train_c50`. CF is the pruning confidence factor (a higher CF means
# less pruning).
tree_result <- C5.0(
  district ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.1)
)
summary(tree_result)

# Draw the fitted tree.
plot(tree_result, faclen = 10, clip.facs = TRUE, subtree = NULL, tweak = 1, digits = 2)

## Prediction
# Test rows: share of predictions that differ from the observed district.
predictions <- predict(tree_result, newdata = test_c50, type = "class")
error_classification <- mean(predictions != test_c50$district)
paste(
  "The classification error in test set is:", 100 * error_classification, "%",
  sum(predictions == test_c50$district),
  "correct classified cases from", length(predictions)
)

# Training rows: the same measure on the data the tree was fitted to.
pred_train <- predict(tree_result, newdata = train_c50)
error_classification <- mean(pred_train != train_c50$district)
paste(
  "The classification error in train set is:", 100 * error_classification, "%",
  sum(pred_train == train_c50$district),
  "correct classified cases from", length(pred_train)
)
```
```{r}
# Fit a C5.0 tree that predicts `arrest` from all remaining columns of the
# current `train_c50`. CF is the pruning confidence factor (a higher CF means
# less pruning).
tree_result <- C5.0(
  arrest ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.9)
)
summary(tree_result)

# Draw the fitted tree.
plot(tree_result, faclen = 10, clip.facs = TRUE, subtree = NULL, tweak = 1, digits = 2)

## Prediction
# Test rows: share of predictions that differ from the observed arrest flag.
predictions <- predict(tree_result, newdata = test_c50, type = "class")
error_classification <- mean(predictions != test_c50$arrest)
paste(
  "The classification error in test set is:", 100 * error_classification, "%",
  sum(predictions == test_c50$arrest),
  "correct classified cases from", length(predictions)
)

# Training rows: the same measure on the data the tree was fitted to.
pred_train <- predict(tree_result, newdata = train_c50)
error_classification <- mean(pred_train != train_c50$arrest)
paste(
  "The classification error in train set is:", 100 * error_classification, "%",
  sum(pred_train == train_c50$arrest),
  "correct classified cases from", length(pred_train)
)
```


```{r}
## Assault: predict the district
## Create the training and test data sets
set.seed(1234)

# Keep only the districts analysed for assault, in both assault data sets.
assault_tr <- subset(
  assault_tr,
  district %in% c("3", "4", "5", "6", "7", "8", "1", "18", "9", "10", "22", "15")
)
assault <- subset(
  assault,
  district %in% c("3", "4", "5", "6", "7", "8", "1", "18", "9", "10", "22", "15")
)

# 80/20 split of the filtered training data, without the unused columns.
# (A disabled alternative used `assault_tr` for training and `assault` for testing.)
crime_split_c50 <- initial_split(
  subset(
    assault_tr,
    select = -c(case_number, block, ward, description, day, month, latitude, longitude)
  ),
  prop = 0.8
)
train_c50 <- training(crime_split_c50)
test_c50 <- testing(crime_split_c50)

# Encode `location_description` as integer codes from ONE level set shared by
# both parts. Coding each part on its own (as before) gives the same number to
# different locations whenever a location is missing from one part. The
# training levels come first, so the training codes are unchanged and only the
# test codes are aligned to them.
location_levels <- union(
  levels(as.factor(train_c50$location_description)),
  levels(as.factor(test_c50$location_description))
)
train_c50$location_description <- as.numeric(
  factor(train_c50$location_description, levels = location_levels)
)
test_c50$location_description <- as.numeric(
  factor(test_c50$location_description, levels = location_levels)
)

# Categorical columns as factors; factor() also drops the district levels
# removed by the filter above.
train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)

# Fit a C5.0 tree that predicts `district` from all remaining columns.
# CF is the pruning confidence factor (a higher CF means less pruning).
tree_result <- C5.0(
  district ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.0001)
)
summary(tree_result)

# Draw the fitted tree.
plot(tree_result, subtree = NULL)

## Prediction
# Test rows: share of predictions that differ from the observed district.
predictions <- predict(tree_result, newdata = test_c50, type = "class")
error_classification <- mean(predictions != test_c50$district)
paste(
  "The classification error in test set is:", 100 * error_classification, "%",
  sum(predictions == test_c50$district),
  "correct classified cases from", length(predictions)
)

# Training rows: the same measure on the data the tree was fitted to.
pred_train <- predict(tree_result, newdata = train_c50)
error_classification <- mean(pred_train != train_c50$district)
paste(
  "The classification error in train set is:", 100 * error_classification, "%",
  sum(pred_train == train_c50$district),
  "correct classified cases from", length(pred_train)
)
```
```{r}
## Criminal damage: predict the district
## Create the training and test data sets
set.seed(1234)

# Keep only the districts analysed for criminal damage, in both data sets.
criminal_damage_tr <- subset(
  criminal_damage_tr,
  district %in% c("3", "4", "5", "6", "7", "8", "1", "18", "9", "20", "25", "11")
)
criminal_damage <- subset(
  criminal_damage,
  district %in% c("3", "4", "5", "6", "7", "8", "1", "18", "9", "20", "25", "11")
)

# Train on `criminal_damage_tr` and test on `criminal_damage`, both without
# the unused columns. (A disabled alternative drew an 80/20 split of
# `chicago_crime_subset_tr` instead.)
train_c50 <- subset(
  criminal_damage_tr,
  select = -c(case_number, block, ward, description, day, month, latitude, longitude)
)
test_c50 <- subset(
  criminal_damage,
  select = -c(case_number, block, ward, description, day, month, latitude, longitude)
)

# Categorical columns as factors. factor() on `district` also drops the
# levels removed by the filter above.
train_c50$location_description <- as.factor(train_c50$location_description)
train_c50$arrest <- as.factor(train_c50$arrest)
train_c50$district <- factor(train_c50$district)

test_c50$location_description <- as.factor(test_c50$location_description)
test_c50$arrest <- as.factor(test_c50$arrest)
test_c50$district <- factor(test_c50$district)

# Fit a C5.0 tree that predicts `district` from all remaining columns.
# CF is the pruning confidence factor (a higher CF means less pruning).
tree_result <- C5.0(
  district ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.001)
)
summary(tree_result)

# Draw the fitted tree.
plot(tree_result, subtree = NULL)

## Prediction
# Test rows: share of predictions that differ from the observed district.
predictions <- predict(tree_result, newdata = test_c50, type = "class")
error_classification <- mean(predictions != test_c50$district)
paste(
  "The classification error in test set is:", 100 * error_classification, "%",
  sum(predictions == test_c50$district),
  "correct classified cases from", length(predictions)
)

# Training rows: the same measure on the data the tree was fitted to.
pred_train <- predict(tree_result, newdata = train_c50)
error_classification <- mean(pred_train != train_c50$district)
paste(
  "The classification error in train set is:", 100 * error_classification, "%",
  sum(pred_train == train_c50$district),
  "correct classified cases from", length(pred_train)
)
```
```{r}
## Deceptive practice: predict the district
## Create the training and test data sets
set.seed(1234)

# Keep only the districts analysed for deceptive practice, in both data sets.
deceptive_practice_tr <- subset(
  deceptive_practice_tr,
  district %in% c("19", "4", "25", "2", "24", "1", "18", "20", "9", "21", "11")
)
deceptive_practice <- subset(
  deceptive_practice,
  district %in% c("19", "4", "25", "2", "24", "1", "18", "20", "9", "21", "11")
)

# 80/20 split of the filtered training data, without `location_description`
# and the other unused columns. (A disabled alternative used
# `deceptive_practice_tr` for training and `deceptive_practice` for testing.)
crime_split_c50 <- initial_split(
  subset(
    deceptive_practice_tr,
    select = -c(location_description, case_number, block, ward, description,
                day, month, latitude, longitude)
  ),
  prop = 0.8
)
train_c50 <- training(crime_split_c50)
test_c50 <- testing(crime_split_c50)

# Categorical columns as factors. factor() on `district` also drops the
# levels removed by the filter above.
train_c50$arrest <- as.factor(train_c50$arrest)
train_c50$district <- factor(train_c50$district)

test_c50$arrest <- as.factor(test_c50$arrest)
test_c50$district <- factor(test_c50$district)

# Fit a C5.0 tree that predicts `district` from all remaining columns.
# CF is the pruning confidence factor (a higher CF means less pruning).
tree_result <- C5.0(
  district ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.5)
)
#summary(tree_result)

# Draw the fitted tree.
plot(tree_result, subtree = NULL)

## Prediction
# Test rows: share of predictions that differ from the observed district.
predictions <- predict(tree_result, newdata = test_c50, type = "class")
error_classification <- mean(predictions != test_c50$district)
paste(
  "The classification error in test set is:", 100 * error_classification, "%",
  sum(predictions == test_c50$district),
  "correct classified cases from", length(predictions)
)

# Training rows: the same measure on the data the tree was fitted to.
pred_train <- predict(tree_result, newdata = train_c50)
error_classification <- mean(pred_train != train_c50$district)
paste(
  "The classification error in train set is:", 100 * error_classification, "%",
  sum(pred_train == train_c50$district),
  "correct classified cases from", length(pred_train)
)
```
```{r}
## Narcotics: predict the district
## Create the training and test data sets
set.seed(1234)

# Keep only the districts analysed for narcotics, in both data sets.
narcotics_tr <- subset(narcotics_tr, district %in% c("10", "11", "15", "8", "1", "18"))
narcotics <- subset(narcotics, district %in% c("10", "11", "15", "8", "1", "18"))

# 80/20 split of the filtered training data, without `location_description`
# and the other unused columns. (A disabled alternative used `narcotics_tr`
# for training and `narcotics` for testing.)
crime_split_c50 <- initial_split(
  subset(
    narcotics_tr,
    select = -c(location_description, case_number, block, ward, description,
                day, month, latitude, longitude)
  ),
  prop = 0.8
)
train_c50 <- training(crime_split_c50)
test_c50 <- testing(crime_split_c50)

# Only `district` is converted here (the `arrest` conversion is disabled).
# factor() also drops the district levels removed by the filter above.
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)

# Fit a C5.0 tree that predicts `district` from all remaining columns.
# CF is the pruning confidence factor (a higher CF means less pruning).
tree_result <- C5.0(
  district ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.01)
)
summary(tree_result)

# Draw the fitted tree.
plot(tree_result, subtree = NULL)

## Prediction
# Test rows: share of predictions that differ from the observed district.
predictions <- predict(tree_result, newdata = test_c50, type = "class")
error_classification <- mean(predictions != test_c50$district)
paste(
  "The classification error in test set is:", 100 * error_classification, "%",
  sum(predictions == test_c50$district),
  "correct classified cases from", length(predictions)
)

# Training rows: the same measure on the data the tree was fitted to.
pred_train <- predict(tree_result, newdata = train_c50)
error_classification <- mean(pred_train != train_c50$district)
paste(
  "The classification error in train set is:", 100 * error_classification, "%",
  sum(pred_train == train_c50$district),
  "correct classified cases from", length(pred_train)
)
```
```{r}
## Robbery: predict the district
## Create the training and test data sets
set.seed(1234)

# Keep only the districts analysed for robbery, in both data sets.
robbery_tr <- subset(
  robbery_tr,
  district %in% c("6", "11", "15", "8", "1", "18", "9", "5", "4")
)
robbery <- subset(
  robbery,
  district %in% c("6", "11", "15", "8", "1", "18", "9", "5", "4")
)

# (A disabled earlier step merged the parking-related location labels into
# "PARKING" before modelling.)

# Train on `robbery_tr` and test on `robbery`, both without the unused
# columns. (A disabled alternative drew an 80/20 split of
# `chicago_crime_subset_tr` instead.)
train_c50 <- subset(
  robbery_tr,
  select = -c(case_number, block, ward, description, day, month, latitude, longitude)
)
test_c50 <- subset(
  robbery,
  select = -c(case_number, block, ward, description, day, month, latitude, longitude)
)

# Encode `location_description` as integer codes from ONE level set shared by
# both data sets. Coding each data set on its own (as before) gives the same
# number to different locations whenever the two sets do not contain exactly
# the same locations. The training levels come first, so the training codes
# are unchanged and only the test codes are aligned to them.
location_levels <- union(
  levels(factor(train_c50$location_description)),
  levels(factor(test_c50$location_description))
)
train_c50$location_description <- as.numeric(
  factor(train_c50$location_description, levels = location_levels)
)
test_c50$location_description <- as.numeric(
  factor(test_c50$location_description, levels = location_levels)
)

# Categorical columns as factors; factor() also drops the district levels
# removed by the filter above.
train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)

# Fit a C5.0 tree that predicts `district` from all remaining columns.
# CF is the pruning confidence factor (a higher CF means less pruning).
tree_result <- C5.0(
  district ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = TRUE, CF = 0.000001)
)
summary(tree_result)

# Draw the fitted tree.
plot(tree_result, subtree = NULL)

## Prediction
# Test rows: share of predictions that differ from the observed district.
predictions <- predict(tree_result, newdata = test_c50, type = "class")
error_classification <- mean(predictions != test_c50$district)
paste(
  "The classification error in test set is:", 100 * error_classification, "%",
  sum(predictions == test_c50$district),
  "correct classified cases from", length(predictions)
)

# Training rows: the same measure on the data the tree was fitted to.
pred_train <- predict(tree_result, newdata = train_c50)
error_classification <- mean(pred_train != train_c50$district)
paste(
  "The classification error in train set is:", 100 * error_classification, "%",
  sum(pred_train == train_c50$district),
  "correct classified cases from", length(pred_train)
)
```
```{r}
## Theft: predict the district
## Create the training and test data sets
set.seed(1234)

# `location_description` is not used for theft. It is removed only when still
# present, so the chunk can be run again without failing on the missing column.
if ("location_description" %in% names(theft)) {
  theft <- subset(theft, select = -c(location_description))
}
if ("location_description" %in% names(theft_tr)) {
  theft_tr <- subset(theft_tr, select = -c(location_description))
}

# Keep only the districts analysed for theft, in both data sets.
theft_tr <- subset(theft_tr, district %in% c("19", "1", "18", "24", "5", "4"))
theft <- subset(theft, district %in% c("19", "1", "18", "24", "5", "4"))

# Train on `theft_tr` and test on `theft`, both without the unused columns.
# (A disabled alternative drew an 80/20 split of `chicago_crime_subset_tr`
# instead.)
train_c50 <- subset(
  theft_tr,
  select = -c(case_number, block, ward, description, day, month, latitude, longitude)
)
test_c50 <- subset(
  theft,
  select = -c(case_number, block, ward, description, day, month, latitude, longitude)
)

# The former `location_description` conversions are gone: the column was
# dropped above, and assigning the resulting zero-length vector to the data
# frame stopped the chunk with an error.

# Only `district` is converted here (the `arrest` conversion is disabled).
# factor() also drops the district levels removed by the filter above.
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)

# Fit a C5.0 tree that predicts `district` from all remaining columns.
# CF is the pruning confidence factor (a higher CF means less pruning).
tree_result <- C5.0(
  district ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.01)
)
#summary(tree_result)

# Draw the fitted tree.
plot(tree_result, subtree = NULL)

## Prediction
# Test rows: share of predictions that differ from the observed district.
predictions <- predict(tree_result, newdata = test_c50, type = "class")
error_classification <- mean(predictions != test_c50$district)
paste(
  "The classification error in test set is:", 100 * error_classification, "%",
  sum(predictions == test_c50$district),
  "correct classified cases from", length(predictions)
)

# Training rows: the same measure on the data the tree was fitted to.
pred_train <- predict(tree_result, newdata = train_c50)
error_classification <- mean(pred_train != train_c50$district)
paste(
  "The classification error in train set is:", 100 * error_classification, "%",
  sum(pred_train == train_c50$district),
  "correct classified cases from", length(pred_train)
)
```
```{r}
set.seed(1234)

# Keep only the fifteen districts studied for violent crime; the same filter
# is applied to the training data and to the evaluation data.
violent_districts <- c("6", "11", "15", "8", "1", "18", "9", "5", "4",
                       "19", "20", "10", "21", "25", "2")
violent_tr_crime <- subset(violent_tr_crime, district %in% violent_districts)
violent_crime <- subset(violent_crime, district %in% violent_districts)

# Drop identifiers, free text and spatial/temporal columns before modelling.
train_c50 <- subset(
  violent_tr_crime,
  select = -c(case_number, block, ward, description, day, month,
              latitude, longitude)
)
test_c50 <- subset(
  violent_crime,
  select = -c(case_number, block, ward, description, day, month,
              latitude, longitude)
)

# C5.0 handles categorical predictors natively, so they are kept as factors.
for (nm in c("location_description", "arrest")) {
  train_c50[[nm]] <- as.factor(train_c50[[nm]])
  test_c50[[nm]] <- as.factor(test_c50[[nm]])
}

# The response: factor() also discards levels of districts filtered out above.
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)

# Boosted C5.0 tree (10 trials). A higher CF means less pruning.
tree_result <- C5.0(
  district ~ .,
  data = train_c50,
  trials = 10,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.000001)
)

# Plot one of the boosted trees.
plot(tree_result, subtree = NULL, trial = 9)

## PREDICTION
# Classify the held-out cases and report the misclassification rate.
predictions <- predict(tree_result, newdata = test_c50, type = "class")
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions == test_c50$district),
      "correct classified cases from", length(predictions))

# Same report on the training data, to compare against the test error.
pred_train <- predict(tree_result, newdata = train_c50)
error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train == train_c50$district),
      "correct classified cases from", length(pred_train))
```

```{r}
set.seed(1234)

# Keep only the eight districts analysed for weapons violations; the same
# filter is applied to the training data and to the evaluation data.
weapons_districts <- c("6", "11", "7", "8", "1", "18", "5", "4")
weapons_violation_tr <- subset(weapons_violation_tr,
                               district %in% weapons_districts)
weapons_violation <- subset(weapons_violation,
                            district %in% weapons_districts)

# Drop identifiers, free text and spatial/temporal columns before modelling.
train_c50 <- subset(
  weapons_violation_tr,
  select = -c(case_number, block, ward, description, day, month,
              latitude, longitude)
)
test_c50 <- subset(
  weapons_violation,
  select = -c(case_number, block, ward, description, day, month,
              latitude, longitude)
)

# Encode location_description as an integer code. The level set is shared by
# both data frames: encoding each one on its own (as was done before) gives
# the same location a different code in train and test whenever the two sets
# do not contain exactly the same locations, so the tree's numeric splits
# would be applied to unrelated categories at prediction time.
# NOTE(review): the codes follow alphabetical order, which has no real
# meaning; keeping the column as a factor may be preferable.
location_levels <- sort(unique(c(
  as.character(train_c50$location_description),
  as.character(test_c50$location_description)
)))
train_c50$location_description <- as.numeric(
  factor(train_c50$location_description, levels = location_levels)
)
test_c50$location_description <- as.numeric(
  factor(test_c50$location_description, levels = location_levels)
)

# The response: factor() also discards levels of districts filtered out above.
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)

# Fit a single C5.0 decision tree. A higher CF means less pruning.
tree_result <- C5.0(
  district ~ .,
  data = train_c50,
  control = C5.0Control(noGlobalPruning = FALSE, CF = 0.01)
)

# Plot the fitted tree.
plot(tree_result, subtree = NULL)

## PREDICTION
# Classify the held-out cases and report the misclassification rate.
predictions <- predict(tree_result, newdata = test_c50, type = "class")
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions == test_c50$district),
      "correct classified cases from", length(predictions))

# Same report on the training data, to compare against the test error.
pred_train <- predict(tree_result, newdata = train_c50)
error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train == train_c50$district),
      "correct classified cases from", length(pred_train))
```
```{r}
library(MASS)       # for obtaining data
library(tidyverse)  # for data processing
library(rpart)      # for CART decision tree
library(rpart.plot) # for plotting CART
library(caret)      # for confusion matrix and more
library(rsample)    # for data splitting
library(randomForest)  # For bagging and randomforest
library(ggpubr)

# Author's note (translated): only 53 distinct categorical values are
# allowed, so the model is fitted as a regression using only the variables
# kept below. The high-cardinality text columns are therefore dropped.
chicago_crime_ensemble <- subset(
  chicago_crime_subset,
  select = -c(case_number, block, description, location_description, ward,
              latitude, longitude)
)
chicago_crime_ensemble_tr <- subset(
  chicago_crime_subset_tr,
  select = -c(case_number, location_description, description, block, ward,
              latitude, longitude)
)

# Numeric encoding of the response and of arrest (factor -> integer code).
# NOTE(review): as.numeric() on primary_type only yields usable codes if the
# column is a factor here; on a character column it returns NA and the
# na.omit() calls below would then drop every row - confirm the column type.
chicago_crime_ensemble$primary_type <-
  as.numeric(chicago_crime_ensemble$primary_type)
chicago_crime_ensemble$arrest <-
  as.numeric(as.factor(chicago_crime_ensemble$arrest))

chicago_crime_ensemble_tr$primary_type <-
  as.numeric(chicago_crime_ensemble_tr$primary_type)
chicago_crime_ensemble_tr$arrest <-
  as.numeric(as.factor(chicago_crime_ensemble_tr$arrest))

chicago_crime_ensemble_tr <- na.omit(chicago_crime_ensemble_tr)
chicago_crime_ensemble <- na.omit(chicago_crime_ensemble)

# Work on at most the first 10000 training / 1000 test rows. head() is used
# instead of [1:10000, ] because indexing past the last row of a data frame
# silently appends all-NA rows.
RF_train <- head(chicago_crime_ensemble_tr, 10000)
RF_test <- head(chicago_crime_ensemble, 1000)

# With a numeric response randomForest fits a regression forest; mtry = 4
# candidate predictors are sampled at each split.
bagging_model <- randomForest(formula = primary_type ~ ., data = RF_train,
                              mtry = 4)

# Result of random forest model
print(bagging_model)

```
## REDES NEURONALES
```{r}
## NETWORKS: unsupervised self-organising map (SOM) on the crime records
chicago_crime_nn <- subset(
  chicago_crime_subset,
  select = -c(case_number, block, ward, day, latitude, longitude)
)
chicago_crime_nn_tr <- subset(
  chicago_crime_subset_tr,
  select = -c(case_number, block, ward, day, latitude, longitude)
)

library(kohonen) # for building the SOM map
library(caret)     #for confusion matrix

# Set the target aside so the map is trained on the predictors only.
target_train <- chicago_crime_nn_tr$primary_type
chicago_crime_nn_tr <- subset(chicago_crime_nn_tr, select = -c(primary_type))

# The SOM needs a numeric matrix: categorical columns become integer codes.
chicago_crime_nn_tr$arrest <-
  as.numeric(as.factor(chicago_crime_nn_tr$arrest))
# NOTE(review): if district is a factor this stores the level codes, not the
# district numbers - confirm that is acceptable.
chicago_crime_nn_tr$district <- as.numeric(chicago_crime_nn_tr$district)
chicago_crime_nn_tr$description <-
  as.numeric(as.factor(chicago_crime_nn_tr$description))
chicago_crime_nn_tr$location_description <-
  as.numeric(as.factor(chicago_crime_nn_tr$location_description))
chicago_crime_nn_tr$month <- as.numeric(chicago_crime_nn_tr$month)

# Reproducible 75/25 split. Note that the columns are NOT scaled here.
set.seed(7)
index <- sample(nrow(chicago_crime_nn_tr),
                round(0.75 * nrow(chicago_crime_nn_tr)))
train <- as.matrix(chicago_crime_nn_tr[index, ])
test <- as.matrix(chicago_crime_nn_tr[-index, ])

train_label <- target_train[index]
test_label <- target_train[-index]

# 8 x 8 hexagonal map
som_grid <- somgrid(xdim = 8, ydim = 8, topo = "hexagonal")

# Train on at most the first 100000 rows; head() does not fail when the
# training matrix is shorter than that, unlike train[1:100000, ].
crime.som <- som(head(train, 100000), grid = som_grid,
                 rlen = 100, alpha = c(0.05, 0.01),
                 radius = 2, keep.data = TRUE)

# Names of the variables used
colnames(train)

# main characteristics of the map
summary(crime.som)

# Training progress
plot(crime.som, type = "changes")

# Number of examples mapped to each unit
plot(crime.som, type = "counts", main = "Examples per Neuron")

# Codebook (weight) vectors
plot(crime.som, type = "codes", main = "Patterns Discovered")

# Blue-to-red palette for the property (heat-map) plots. Both spellings are
# kept because both names were defined originally.
coolBlueHOtRed <- function(n, alpha = 1) {
  rainbow(n, end = 4/6, alpha = alpha)[n:1]
}
coolBlueHotRed <- coolBlueHOtRed

# Heat maps of the first five variables on a 2 x 3 layout.
par(mfrow = c(2, 3))
for (j in seq_len(5)) {
  plot(crime.som, type = "property", property = getCodes(crime.som, 1)[, j],
       main = colnames(getCodes(crime.som, 1))[j],
       palette.name = coolBlueHOtRed)
}

# Heat maps of every variable on a 5 x 3 layout.
par(mfrow = c(5, 3))
for (j in seq_len(ncol(train))) {
  plot(crime.som, type = "property", property = crime.som$codes[[1]][, j],
       palette.name = coolBlueHotRed,
       main = colnames(train)[j], cex = 0.5)
}

#som.prediction <- predict(crime.som, test)
```
```{r}
# Group the SOM units into clusters: hierarchical clustering of the codebook
# vectors, cut into `groups` clusters, then drawn on top of the codes plot.
groups <- 8
codebook_tree <- hclust(dist(crime.som$codes[[1]]))
crime.hc <- cutree(codebook_tree, groups)
plot(crime.som,
     type = "codes",
     bgcol = rainbow(groups)[crime.hc],
     main = "clustering the patterns discovered")
add.cluster.boundaries(crime.som, crime.hc)
```
```{r}

# Supervised SOM (xyf) predicting the crime type from the same predictors.
# 8 x 8 hexagonal map
som_grid <- somgrid(xdim = 8, ydim = 8, topo = "hexagonal")

# Fit on at most the first 100000 training rows. head() keeps the predictor
# matrix and the labels aligned and does not fail on shorter inputs.
fit_label <- head(train_label, 100000)
set.seed(7)
kohmap <- xyf(head(train, 100000), fit_label,
              grid = som_grid,
              rlen = 100, alpha = c(0.05, 0.01),
              radius = 2, keep.data = TRUE)

# Training progress
plot(kohmap, type = "changes")

# Codebook vectors of the layers, drawn as lines
plot(kohmap, type = "codes", codeRendering = "lines", shape = "straight",
     main = c("Crime"))

plot(kohmap, type = "codes", codeRendering = "lines", shape = "straight",
     main = c(" patterns"))

# Classes of the mapped objects. Only the labels of the rows the map was
# fitted on are passed, so each label lines up with a mapped object.
plot(kohmap, type = "mapping", labels = as.numeric(fit_label) + 3,
     col = as.numeric(fit_label) + 3, pch = 4, main = "Map of classes",
     palette.name = terrain.colors)

# caret::confusionMatrix() reads a table as predictions in rows and
# reference in columns, so the tables below are built in that orientation.

# Prediction using the training data
print("TRAINING PREDICTION")
kohmap.predict_tr <- predict(kohmap, newdata = train, whatmap = 1)
prediction_table_tr <- table(Prediction = kohmap.predict_tr$predictions[[2]],
                             Reference = train_label)
confusionMatrix(prediction_table_tr)

# Prediction using the test data
print("TEST PREDICTION")
kohmap.predict <- predict(kohmap, newdata = test, whatmap = 1)
prediction_table <- table(Prediction = kohmap.predict$predictions[[2]],
                          Reference = test_label)
confusionMatrix(prediction_table)
```
```{r}
## NETWORKS: second SOM, this time keeping ward and dropping description
chicago_crime_nn2 <- subset(
  chicago_crime_subset,
  select = -c(description, case_number, block, day, latitude, longitude)
)
chicago_crime_nn_tr2 <- subset(
  chicago_crime_subset_tr,
  select = -c(description, case_number, block, day, latitude, longitude)
)

library(kohonen) # for building the SOM map
library(caret)     #for confusion matrix

# Set the targets aside so the map is trained on the predictors only.
target_train2 <- chicago_crime_nn_tr2$primary_type
chicago_crime_nn_tr2 <- subset(chicago_crime_nn_tr2,
                               select = -c(primary_type))

target_test2 <- chicago_crime_nn2$primary_type
chicago_crime_nn2 <- subset(chicago_crime_nn2, select = -c(primary_type))

# The SOM needs a numeric matrix: categorical columns become integer codes.
chicago_crime_nn_tr2$arrest <-
  as.numeric(as.factor(chicago_crime_nn_tr2$arrest))
# NOTE(review): if district is a factor this stores the level codes, not the
# district numbers - confirm that is acceptable.
chicago_crime_nn_tr2$district <- as.numeric(chicago_crime_nn_tr2$district)
chicago_crime_nn_tr2$location_description <-
  as.numeric(as.factor(chicago_crime_nn_tr2$location_description))
chicago_crime_nn_tr2$month <- as.numeric(chicago_crime_nn_tr2$month)

# Reproducible 75/25 split. Note that the columns are NOT scaled here.
set.seed(7)
index <- sample(nrow(chicago_crime_nn_tr2),
                round(0.75 * nrow(chicago_crime_nn_tr2)))
train <- as.matrix(chicago_crime_nn_tr2[index, ])
test <- as.matrix(chicago_crime_nn_tr2[-index, ])

train_label <- target_train2[index]
test_label <- target_train2[-index]

# 8 x 8 hexagonal map
som_grid <- somgrid(xdim = 8, ydim = 8, topo = "hexagonal")

# Train on at most the first 100000 rows; head() does not fail when the
# training matrix is shorter than that, unlike train[1:100000, ].
crime.som <- som(head(train, 100000), grid = som_grid,
                 rlen = 100, alpha = c(0.05, 0.01),
                 radius = 2, keep.data = TRUE)

# Names of the variables used
colnames(train)

# main characteristics of the map
summary(crime.som)

# Training progress
plot(crime.som, type = "changes")

# Number of examples mapped to each unit
plot(crime.som, type = "counts", main = "Examples per Neuron")

# Codebook (weight) vectors
plot(crime.som, type = "codes", main = "Patterns Discovered")

# Blue-to-red palette for the property (heat-map) plots. Both spellings are
# kept because both names were defined originally.
coolBlueHOtRed <- function(n, alpha = 1) {
  rainbow(n, end = 4/6, alpha = alpha)[n:1]
}
coolBlueHotRed <- coolBlueHOtRed

# Heat maps of the first three variables.
par(mfrow = c(2, 3))
for (j in seq_len(3)) {
  plot(crime.som, type = "property", property = getCodes(crime.som, 1)[, j],
       main = colnames(getCodes(crime.som, 1))[j],
       palette.name = coolBlueHOtRed)
}

```
```{r}
# Group the SOM units into clusters: hierarchical clustering of the codebook
# vectors, cut into `groups` clusters, coloured with a ColorBrewer palette.
library(RColorBrewer)
groups <- 8
codebook_tree <- hclust(dist(crime.som$codes[[1]]))
crime.hc <- cutree(codebook_tree, groups)
plot(crime.som,
     type = "codes",
     bgcol = brewer.pal(groups, name = "YlGnBu")[crime.hc],
     main = "clustering the patterns discovered")
add.cluster.boundaries(crime.som, crime.hc)
```
```{r}
# Supervised SOM (xyf) predicting the crime type from the current predictors.
# 8 x 8 hexagonal map
som_grid <- somgrid(xdim = 8, ydim = 8, topo = "hexagonal")

# Fit on at most the first 100000 training rows. head() keeps the predictor
# matrix and the labels aligned and does not fail on shorter inputs.
set.seed(7)
kohmap <- xyf(head(train, 100000), head(train_label, 100000),
              grid = som_grid,
              rlen = 100, alpha = c(0.05, 0.01),
              radius = 2, keep.data = TRUE)

# Training progress
plot(kohmap, type = "changes")

# Codebook vectors of the layers, drawn as lines
plot(kohmap, type = "codes", codeRendering = "lines", shape = "straight",
     main = c("Crime"))

plot(kohmap, type = "codes", codeRendering = "lines", shape = "straight",
     main = c(" patterns"))

# Plotting classes in neurons
#plot(kohmap, type="mapping", labels=as.numeric(train_label)+3,
#     col=as.numeric(train_label)+3, pch=4, main="Map of classes", palette.name = terrain.colors)

# caret::confusionMatrix() reads a table as predictions in rows and
# reference in columns, so the tables below are built in that orientation.

# Prediction using the training data
print("TRAINING PREDICTION")
kohmap.predict_tr <- predict(kohmap, newdata = train, whatmap = 1)
prediction_table_tr <- table(Prediction = kohmap.predict_tr$predictions[[2]],
                             Reference = train_label)
confusionMatrix(prediction_table_tr)

# Prediction using the test data
print("TEST PREDICTION")
kohmap.predict <- predict(kohmap, newdata = test, whatmap = 1)
prediction_table <- table(Prediction = kohmap.predict$predictions[[2]],
                          Reference = test_label)
confusionMatrix(prediction_table)
```
```{r}
## NETWORKS: SOM on the narcotics records, with district as the target
narcotics_nn <- subset(
  narcotics_tr,
  select = -c(case_number, block, day, latitude, longitude)
)

library(kohonen) # for building the SOM map
library(caret)     #for confusion matrix

# Set the target aside so the map is trained on the predictors only.
target_train <- narcotics_nn$district
narcotics_nn <- subset(narcotics_nn, select = -c(district))

# The SOM needs a numeric matrix: categorical columns become integer codes.
for (nm in c("arrest", "primary_type", "description",
             "location_description")) {
  narcotics_nn[[nm]] <- as.numeric(as.factor(narcotics_nn[[nm]]))
}
narcotics_nn$month <- as.numeric(narcotics_nn$month)

# Reproducible 75/25 split. Note that the columns are NOT scaled here.
set.seed(7)
index <- sample(nrow(narcotics_nn), round(0.75 * nrow(narcotics_nn)))
train <- as.matrix(narcotics_nn[index, ])
test <- as.matrix(narcotics_nn[-index, ])

train_label <- target_train[index]
test_label <- target_train[-index]

# 8 x 8 hexagonal map
som_grid <- somgrid(xdim = 8, ydim = 8, topo = "hexagonal")

# Train the unsupervised map on every training row.
crime.som <- som(
  train,
  grid = som_grid,
  rlen = 100,
  alpha = c(0.05, 0.01),
  radius = 2,
  keep.data = TRUE
)

# Names of the variables used
colnames(train)

# main characteristics of the map
summary(crime.som)

# Training progress
plot(crime.som, type = "changes")

# Number of examples mapped to each unit
plot(crime.som, type = "counts", main = "Examples per Neuron")

# Codebook (weight) vectors
plot(crime.som, type = "codes", main = "Patterns Discovered")

```
```{r}
# Supervised SOM (xyf) predicting the district of the narcotics records.
# 8 x 8 hexagonal map
som_grid <- somgrid(xdim = 8, ydim = 8, topo = "hexagonal")

set.seed(7)
kohmap <- xyf(train, train_label,
              grid = som_grid,
              rlen = 100, alpha = c(0.05, 0.01),
              radius = 2, keep.data = TRUE)

# Training progress
plot(kohmap, type = "changes")

# Codebook vectors of the layers, drawn as lines
plot(kohmap, type = "codes", codeRendering = "lines", shape = "straight",
     main = c("Crime"))

plot(kohmap, type = "codes", codeRendering = "lines", shape = "straight",
     main = c(" patterns"))

# caret::confusionMatrix() reads a table as predictions in rows and
# reference in columns, so the tables below are built in that orientation.

# Prediction using the training data
print("TRAINING PREDICTION")
kohmap.predict_tr <- predict(kohmap, newdata = train, whatmap = 1)
prediction_table_tr <- table(Prediction = kohmap.predict_tr$predictions[[2]],
                             Reference = train_label)
confusionMatrix(prediction_table_tr)

# Prediction using the test data
print("TEST PREDICTION")
kohmap.predict <- predict(kohmap, newdata = test, whatmap = 1)
prediction_table <- table(Prediction = kohmap.predict$predictions[[2]],
                          Reference = test_label)
confusionMatrix(prediction_table)
```
```{r}
## NETWORKS: SOM and supervised SOM on the weapons-violation records,
## with district as the target
crimen <- subset(
  weapons_violation_tr,
  select = -c(case_number, block, day, latitude, longitude)
)

library(kohonen) # for building the SOM map
library(caret)     #for confusion matrix

# Set the target aside so the maps are trained on the predictors only.
target_train <- crimen$district
crimen <- subset(crimen, select = -c(district))

# The SOM needs a numeric matrix: categorical columns become integer codes.
crimen$arrest <- as.numeric(as.factor(crimen$arrest))
crimen$primary_type <- as.numeric(as.factor(crimen$primary_type))
crimen$description <- as.numeric(as.factor(crimen$description))
crimen$location_description <-
  as.numeric(as.factor(crimen$location_description))
crimen$month <- as.numeric(crimen$month)

# Reproducible 75/25 split. Note that the columns are NOT scaled here.
set.seed(7)
index <- sample(nrow(crimen), round(0.75 * nrow(crimen)))
train <- as.matrix(crimen[index, ])
test <- as.matrix(crimen[-index, ])

train_label <- target_train[index]
test_label <- target_train[-index]

# 12 x 12 hexagonal map
som_grid <- somgrid(xdim = 12, ydim = 12, topo = "hexagonal")

# Unsupervised map on every training row.
crime.som <- som(train, grid = som_grid,
                 rlen = 100, alpha = c(0.05, 0.01),
                 radius = 2, keep.data = TRUE)

# Names of the variables used
colnames(train)

# main characteristics of the map
summary(crime.som)

# Training progress
plot(crime.som, type = "changes")

# Number of examples mapped to each unit
plot(crime.som, type = "counts", main = "Examples per Neuron")

# Codebook (weight) vectors
plot(crime.som, type = "codes", main = "Patterns Discovered")

# Supervised map (xyf) on the same grid layout, seeded for reproducibility.
som_grid <- somgrid(xdim = 12, ydim = 12, topo = "hexagonal")
set.seed(7)
kohmap <- xyf(train, train_label,
              grid = som_grid,
              rlen = 100, alpha = c(0.05, 0.01),
              radius = 2, keep.data = TRUE)

# Training progress
plot(kohmap, type = "changes")

# Codebook vectors of the layers, drawn as lines
plot(kohmap, type = "codes", codeRendering = "lines", shape = "straight",
     main = c("Crime"))

plot(kohmap, type = "codes", codeRendering = "lines", shape = "straight",
     main = c(" patterns"))

# Plotting classes in neurons
#plot(kohmap, type="mapping", labels=as.numeric(train_label)+3,
#     col=as.numeric(train_label)+3, pch=4, main="Map of classes", palette.name = terrain.colors)

# caret::confusionMatrix() reads a table as predictions in rows and
# reference in columns, so the tables below are built in that orientation.

# Prediction using the training data
print("TRAINING PREDICTION")
kohmap.predict_tr <- predict(kohmap, newdata = train, whatmap = 1)
prediction_table_tr <- table(Prediction = kohmap.predict_tr$predictions[[2]],
                             Reference = train_label)
confusionMatrix(prediction_table_tr)

# Prediction using the test data
print("TEST PREDICTION")
kohmap.predict <- predict(kohmap, newdata = test, whatmap = 1)
prediction_table <- table(Prediction = kohmap.predict$predictions[[2]],
                          Reference = test_label)
confusionMatrix(prediction_table)
```
```{r}
# Group the SOM units into clusters: hierarchical clustering of the codebook
# vectors, cut into `groups` clusters, coloured with a heat palette.
library(RColorBrewer)
groups <- 23
codebook_tree <- hclust(dist(crime.som$codes[[1]]))
crime.hc <- cutree(codebook_tree, groups)
plot(crime.som,
     type = "codes",
     bgcol = heat.colors(groups)[crime.hc],
     main = "clustering the patterns discovered")
add.cluster.boundaries(crime.som, crime.hc)
```



